1 | use std::borrow::Cow; |
2 | use std::collections::HashMap; |
3 | use std::fmt; |
4 | use std::iter::FusedIterator; |
5 | use std::ops::{Index, Range}; |
6 | use std::str::FromStr; |
7 | use std::sync::Arc; |
8 | |
9 | use crate::find_byte::find_byte; |
10 | |
11 | use crate::error::Error; |
12 | use crate::exec::{Exec, ExecNoSync}; |
13 | use crate::expand::expand_bytes; |
14 | use crate::re_builder::bytes::RegexBuilder; |
15 | use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; |
16 | |
/// A single match of a regex in a haystack.
///
/// A `Match` keeps a reference to the haystack that was searched together
/// with the byte offsets at which the match starts and ends.
///
/// The lifetime parameter `'t` refers to the lifetime of the matched text.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'t> {
    /// The haystack the offsets below index into.
    text: &'t [u8],
    /// Byte offset at which the match begins.
    start: usize,
    /// Byte offset at which the match ends (used as an exclusive bound).
    end: usize,
}

impl<'t> Match<'t> {
    /// Returns the starting byte offset of the match in the haystack.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the ending byte offset of the match in the haystack.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns true if and only if this match has a length of zero.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.start == self.end
    }

    /// Returns the length, in bytes, of this match.
    #[inline]
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Returns the range over the starting and ending byte offsets of the
    /// match in the haystack.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.start..self.end
    }

    /// Returns the matched text, i.e. the haystack sliced by `range()`.
    #[inline]
    pub fn as_bytes(&self) -> &'t [u8] {
        &self.text[self.range()]
    }

    /// Creates a new match from the given haystack and byte offsets.
    #[inline]
    fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
        Match { text: haystack, start, end }
    }
}

impl<'t> std::fmt::Debug for Match<'t> {
    /// Formats the match as a struct with `start`, `end` and `bytes` fields.
    ///
    /// `bytes` is shown as a string when the matched bytes are valid UTF-8
    /// and as a list of byte values otherwise.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let mut dbg = f.debug_struct("Match");
        dbg.field("start", &self.start).field("end", &self.end);
        if let Ok(s) = std::str::from_utf8(self.as_bytes()) {
            dbg.field("bytes", &s);
        } else {
            // FIXME: It would be nice if this could be printed as a string
            // with invalid UTF-8 replaced with hex escapes. An alloc would
            // probably be okay if that makes it easier, but regex-automata
            // does (at time of writing) have internal routines that do this.
            // So maybe we should expose them.
            dbg.field("bytes", &self.as_bytes());
        }
        dbg.finish()
    }
}

impl<'t> From<Match<'t>> for Range<usize> {
    /// Converts a match into the range of byte offsets it covers.
    fn from(m: Match<'t>) -> Range<usize> {
        m.range()
    }
}
95 | |
96 | /// A compiled regular expression for matching arbitrary bytes. |
97 | /// |
98 | /// It can be used to search, split or replace text. All searching is done with |
99 | /// an implicit `.*?` at the beginning and end of an expression. To force an |
100 | /// expression to match the whole string (or a prefix or a suffix), you must |
101 | /// use an anchor like `^` or `$` (or `\A` and `\z`). |
102 | /// |
103 | /// Like the `Regex` type in the parent module, matches with this regex return |
104 | /// byte offsets into the search text. **Unlike** the parent `Regex` type, |
105 | /// these byte offsets may not correspond to UTF-8 sequence boundaries since |
106 | /// the regexes in this module can match arbitrary bytes. |
107 | #[derive (Clone)] |
108 | pub struct Regex(Exec); |
109 | |
110 | impl fmt::Display for Regex { |
111 | /// Shows the original regular expression. |
112 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
113 | write!(f, " {}" , self.as_str()) |
114 | } |
115 | } |
116 | |
117 | impl fmt::Debug for Regex { |
118 | /// Shows the original regular expression. |
119 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
120 | fmt::Display::fmt(self, f) |
121 | } |
122 | } |
123 | |
124 | /// A constructor for Regex from an Exec. |
125 | /// |
126 | /// This is hidden because Exec isn't actually part of the public API. |
127 | #[doc (hidden)] |
128 | impl From<Exec> for Regex { |
129 | fn from(exec: Exec) -> Regex { |
130 | Regex(exec) |
131 | } |
132 | } |
133 | |
134 | impl FromStr for Regex { |
135 | type Err = Error; |
136 | |
137 | /// Attempts to parse a string into a regular expression |
138 | fn from_str(s: &str) -> Result<Regex, Error> { |
139 | Regex::new(re:s) |
140 | } |
141 | } |
142 | |
143 | /// Core regular expression methods. |
144 | impl Regex { |
145 | /// Compiles a regular expression. Once compiled, it can be used repeatedly |
146 | /// to search, split or replace text in a string. |
147 | /// |
148 | /// If an invalid expression is given, then an error is returned. |
149 | pub fn new(re: &str) -> Result<Regex, Error> { |
150 | RegexBuilder::new(re).build() |
151 | } |
152 | |
153 | /// Returns true if and only if there is a match for the regex in the |
154 | /// string given. |
155 | /// |
156 | /// It is recommended to use this method if all you need to do is test |
157 | /// a match, since the underlying matching engine may be able to do less |
158 | /// work. |
159 | /// |
160 | /// # Example |
161 | /// |
162 | /// Test if some text contains at least one word with exactly 13 ASCII word |
163 | /// bytes: |
164 | /// |
165 | /// ```rust |
166 | /// # use regex::bytes::Regex; |
167 | /// # fn main() { |
168 | /// let text = b"I categorically deny having triskaidekaphobia." ; |
169 | /// assert!(Regex::new(r"\b\w{13}\b" ).unwrap().is_match(text)); |
170 | /// # } |
171 | /// ``` |
172 | pub fn is_match(&self, text: &[u8]) -> bool { |
173 | self.is_match_at(text, 0) |
174 | } |
175 | |
176 | /// Returns the start and end byte range of the leftmost-first match in |
177 | /// `text`. If no match exists, then `None` is returned. |
178 | /// |
179 | /// Note that this should only be used if you want to discover the position |
180 | /// of the match. Testing the existence of a match is faster if you use |
181 | /// `is_match`. |
182 | /// |
183 | /// # Example |
184 | /// |
185 | /// Find the start and end location of the first word with exactly 13 |
186 | /// ASCII word bytes: |
187 | /// |
188 | /// ```rust |
189 | /// # use regex::bytes::Regex; |
190 | /// # fn main() { |
191 | /// let text = b"I categorically deny having triskaidekaphobia." ; |
192 | /// let mat = Regex::new(r"\b\w{13}\b" ).unwrap().find(text).unwrap(); |
193 | /// assert_eq!((mat.start(), mat.end()), (2, 15)); |
194 | /// # } |
195 | /// ``` |
196 | pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> { |
197 | self.find_at(text, 0) |
198 | } |
199 | |
200 | /// Returns an iterator for each successive non-overlapping match in |
201 | /// `text`, returning the start and end byte indices with respect to |
202 | /// `text`. |
203 | /// |
204 | /// # Example |
205 | /// |
206 | /// Find the start and end location of every word with exactly 13 ASCII |
207 | /// word bytes: |
208 | /// |
209 | /// ```rust |
210 | /// # use regex::bytes::Regex; |
211 | /// # fn main() { |
212 | /// let text = b"Retroactively relinquishing remunerations is reprehensible." ; |
213 | /// for mat in Regex::new(r"\b\w{13}\b" ).unwrap().find_iter(text) { |
214 | /// println!("{:?}" , mat); |
215 | /// } |
216 | /// # } |
217 | /// ``` |
218 | pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { |
219 | Matches(self.0.searcher().find_iter(text)) |
220 | } |
221 | |
222 | /// Returns the capture groups corresponding to the leftmost-first |
223 | /// match in `text`. Capture group `0` always corresponds to the entire |
224 | /// match. If no match is found, then `None` is returned. |
225 | /// |
226 | /// You should only use `captures` if you need access to the location of |
227 | /// capturing group matches. Otherwise, `find` is faster for discovering |
228 | /// the location of the overall match. |
229 | /// |
230 | /// # Examples |
231 | /// |
232 | /// Say you have some text with movie names and their release years, |
233 | /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text |
234 | /// looking like that, while also extracting the movie name and its release |
235 | /// year separately. |
236 | /// |
237 | /// ```rust |
238 | /// # use regex::bytes::Regex; |
239 | /// # fn main() { |
240 | /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)" ).unwrap(); |
241 | /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)." ; |
242 | /// let caps = re.captures(text).unwrap(); |
243 | /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane" [..]); |
244 | /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941" [..]); |
245 | /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)" [..]); |
246 | /// // You can also access the groups by index using the Index notation. |
247 | /// // Note that this will panic on an invalid index. |
248 | /// assert_eq!(&caps[1], b"Citizen Kane" ); |
249 | /// assert_eq!(&caps[2], b"1941" ); |
250 | /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)" ); |
251 | /// # } |
252 | /// ``` |
253 | /// |
254 | /// Note that the full match is at capture group `0`. Each subsequent |
255 | /// capture group is indexed by the order of its opening `(`. |
256 | /// |
257 | /// We can make this example a bit clearer by using *named* capture groups: |
258 | /// |
259 | /// ```rust |
260 | /// # use regex::bytes::Regex; |
261 | /// # fn main() { |
262 | /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" ) |
263 | /// .unwrap(); |
264 | /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)." ; |
265 | /// let caps = re.captures(text).unwrap(); |
266 | /// assert_eq!(caps.name("title" ).unwrap().as_bytes(), b"Citizen Kane" ); |
267 | /// assert_eq!(caps.name("year" ).unwrap().as_bytes(), b"1941" ); |
268 | /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)" [..]); |
269 | /// // You can also access the groups by name using the Index notation. |
270 | /// // Note that this will panic on an invalid group name. |
271 | /// assert_eq!(&caps["title" ], b"Citizen Kane" ); |
272 | /// assert_eq!(&caps["year" ], b"1941" ); |
273 | /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)" ); |
274 | /// |
275 | /// # } |
276 | /// ``` |
277 | /// |
278 | /// Here we name the capture groups, which we can access with the `name` |
279 | /// method or the `Index` notation with a `&str`. Note that the named |
280 | /// capture groups are still accessible with `get` or the `Index` notation |
281 | /// with a `usize`. |
282 | /// |
283 | /// The `0`th capture group is always unnamed, so it must always be |
284 | /// accessed with `get(0)` or `[0]`. |
285 | pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { |
286 | self.captures_at(text, 0) |
287 | } |
288 | |
289 | /// Returns an iterator over all the non-overlapping capture groups matched |
290 | /// in `text`. This is operationally the same as `find_iter`, except it |
291 | /// yields information about capturing group matches. |
292 | /// |
293 | /// # Example |
294 | /// |
295 | /// We can use this to find all movie titles and their release years in |
296 | /// some text, where the movie is formatted like "'Title' (xxxx)": |
297 | /// |
298 | /// ```rust |
299 | /// # use std::str; use regex::bytes::Regex; |
300 | /// # fn main() { |
301 | /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" ) |
302 | /// .unwrap(); |
303 | /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)." ; |
304 | /// for caps in re.captures_iter(text) { |
305 | /// let title = str::from_utf8(&caps["title" ]).unwrap(); |
306 | /// let year = str::from_utf8(&caps["year" ]).unwrap(); |
307 | /// println!("Movie: {:?}, Released: {:?}" , title, year); |
308 | /// } |
309 | /// // Output: |
310 | /// // Movie: Citizen Kane, Released: 1941 |
311 | /// // Movie: The Wizard of Oz, Released: 1939 |
312 | /// // Movie: M, Released: 1931 |
313 | /// # } |
314 | /// ``` |
315 | pub fn captures_iter<'r, 't>( |
316 | &'r self, |
317 | text: &'t [u8], |
318 | ) -> CaptureMatches<'r, 't> { |
319 | CaptureMatches(self.0.searcher().captures_iter(text)) |
320 | } |
321 | |
322 | /// Returns an iterator of substrings of `text` delimited by a match of the |
323 | /// regular expression. Namely, each element of the iterator corresponds to |
324 | /// text that *isn't* matched by the regular expression. |
325 | /// |
326 | /// This method will *not* copy the text given. |
327 | /// |
328 | /// # Example |
329 | /// |
330 | /// To split a string delimited by arbitrary amounts of spaces or tabs: |
331 | /// |
332 | /// ```rust |
333 | /// # use regex::bytes::Regex; |
334 | /// # fn main() { |
335 | /// let re = Regex::new(r"[ \t]+" ).unwrap(); |
336 | /// let fields: Vec<&[u8]> = re.split(b"a b \t c \td e" ).collect(); |
337 | /// assert_eq!(fields, vec![ |
338 | /// &b"a" [..], &b"b" [..], &b"c" [..], &b"d" [..], &b"e" [..], |
339 | /// ]); |
340 | /// # } |
341 | /// ``` |
342 | pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { |
343 | Split { finder: self.find_iter(text), last: 0 } |
344 | } |
345 | |
346 | /// Returns an iterator of at most `limit` substrings of `text` delimited |
347 | /// by a match of the regular expression. (A `limit` of `0` will return no |
348 | /// substrings.) Namely, each element of the iterator corresponds to text |
349 | /// that *isn't* matched by the regular expression. The remainder of the |
350 | /// string that is not split will be the last element in the iterator. |
351 | /// |
352 | /// This method will *not* copy the text given. |
353 | /// |
354 | /// # Example |
355 | /// |
356 | /// Get the first two words in some text: |
357 | /// |
358 | /// ```rust |
359 | /// # use regex::bytes::Regex; |
360 | /// # fn main() { |
361 | /// let re = Regex::new(r"\W+" ).unwrap(); |
362 | /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?" , 3).collect(); |
363 | /// assert_eq!(fields, vec![&b"Hey" [..], &b"How" [..], &b"are you?" [..]]); |
364 | /// # } |
365 | /// ``` |
366 | pub fn splitn<'r, 't>( |
367 | &'r self, |
368 | text: &'t [u8], |
369 | limit: usize, |
370 | ) -> SplitN<'r, 't> { |
371 | SplitN { splits: self.split(text), n: limit } |
372 | } |
373 | |
374 | /// Replaces the leftmost-first match with the replacement provided. The |
375 | /// replacement can be a regular byte string (where `$N` and `$name` are |
376 | /// expanded to match capture groups) or a function that takes the matches' |
377 | /// `Captures` and returns the replaced byte string. |
378 | /// |
379 | /// If no match is found, then a copy of the byte string is returned |
380 | /// unchanged. |
381 | /// |
382 | /// # Replacement string syntax |
383 | /// |
/// All instances of `$name` in the replacement text are replaced with the
/// corresponding capture group `name`.
386 | /// |
387 | /// `name` may be an integer corresponding to the index of the |
388 | /// capture group (counted by order of opening parenthesis where `0` is the |
389 | /// entire match) or it can be a name (consisting of letters, digits or |
390 | /// underscores) corresponding to a named capture group. |
391 | /// |
392 | /// If `name` isn't a valid capture group (whether the name doesn't exist |
393 | /// or isn't a valid index), then it is replaced with the empty string. |
394 | /// |
395 | /// The longest possible name is used. e.g., `$1a` looks up the capture |
396 | /// group named `1a` and not the capture group at index `1`. To exert more |
397 | /// precise control over the name, use braces, e.g., `${1}a`. |
398 | /// |
399 | /// To write a literal `$` use `$$`. |
400 | /// |
401 | /// # Examples |
402 | /// |
403 | /// Note that this function is polymorphic with respect to the replacement. |
404 | /// In typical usage, this can just be a normal byte string: |
405 | /// |
406 | /// ```rust |
407 | /// # use regex::bytes::Regex; |
408 | /// # fn main() { |
409 | /// let re = Regex::new("[^01]+" ).unwrap(); |
410 | /// assert_eq!(re.replace(b"1078910" , &b"" [..]), &b"1010" [..]); |
411 | /// # } |
412 | /// ``` |
413 | /// |
414 | /// But anything satisfying the `Replacer` trait will work. For example, a |
415 | /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the |
416 | /// captures corresponding to a match. This allows one to access capturing |
417 | /// group matches easily: |
418 | /// |
419 | /// ```rust |
420 | /// # use regex::bytes::Regex; |
421 | /// # use regex::bytes::Captures; fn main() { |
422 | /// let re = Regex::new(r"([^,\s]+),\s+(\S+)" ).unwrap(); |
423 | /// let result = re.replace(b"Springsteen, Bruce" , |caps: &Captures| { |
424 | /// let mut replacement = caps[2].to_owned(); |
425 | /// replacement.push(b' ' ); |
426 | /// replacement.extend(&caps[1]); |
427 | /// replacement |
428 | /// }); |
429 | /// assert_eq!(result, &b"Bruce Springsteen" [..]); |
430 | /// # } |
431 | /// ``` |
432 | /// |
433 | /// But this is a bit cumbersome to use all the time. Instead, a simple |
434 | /// syntax is supported that expands `$name` into the corresponding capture |
435 | /// group. Here's the last example, but using this expansion technique |
436 | /// with named capture groups: |
437 | /// |
438 | /// ```rust |
439 | /// # use regex::bytes::Regex; |
440 | /// # fn main() { |
441 | /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)" ).unwrap(); |
442 | /// let result = re.replace(b"Springsteen, Bruce" , &b"$first $last" [..]); |
443 | /// assert_eq!(result, &b"Bruce Springsteen" [..]); |
444 | /// # } |
445 | /// ``` |
446 | /// |
447 | /// Note that using `$2` instead of `$first` or `$1` instead of `$last` |
448 | /// would produce the same result. To write a literal `$` use `$$`. |
449 | /// |
450 | /// Sometimes the replacement string requires use of curly braces to |
451 | /// delineate a capture group replacement and surrounding literal text. |
452 | /// For example, if we wanted to join two words together with an |
453 | /// underscore: |
454 | /// |
455 | /// ```rust |
456 | /// # use regex::bytes::Regex; |
457 | /// # fn main() { |
458 | /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)" ).unwrap(); |
459 | /// let result = re.replace(b"deep fried" , &b"${first}_$second" [..]); |
460 | /// assert_eq!(result, &b"deep_fried" [..]); |
461 | /// # } |
462 | /// ``` |
463 | /// |
464 | /// Without the curly braces, the capture group name `first_` would be |
465 | /// used, and since it doesn't exist, it would be replaced with the empty |
466 | /// string. |
467 | /// |
468 | /// Finally, sometimes you just want to replace a literal string with no |
469 | /// regard for capturing group expansion. This can be done by wrapping a |
470 | /// byte string with `NoExpand`: |
471 | /// |
472 | /// ```rust |
473 | /// # use regex::bytes::Regex; |
474 | /// # fn main() { |
475 | /// use regex::bytes::NoExpand; |
476 | /// |
477 | /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)" ).unwrap(); |
478 | /// let result = re.replace(b"Springsteen, Bruce" , NoExpand(b"$2 $last" )); |
479 | /// assert_eq!(result, &b"$2 $last" [..]); |
480 | /// # } |
481 | /// ``` |
482 | pub fn replace<'t, R: Replacer>( |
483 | &self, |
484 | text: &'t [u8], |
485 | rep: R, |
486 | ) -> Cow<'t, [u8]> { |
487 | self.replacen(text, 1, rep) |
488 | } |
489 | |
490 | /// Replaces all non-overlapping matches in `text` with the replacement |
491 | /// provided. This is the same as calling `replacen` with `limit` set to |
492 | /// `0`. |
493 | /// |
494 | /// See the documentation for `replace` for details on how to access |
495 | /// capturing group matches in the replacement text. |
496 | pub fn replace_all<'t, R: Replacer>( |
497 | &self, |
498 | text: &'t [u8], |
499 | rep: R, |
500 | ) -> Cow<'t, [u8]> { |
501 | self.replacen(text, 0, rep) |
502 | } |
503 | |
504 | /// Replaces at most `limit` non-overlapping matches in `text` with the |
505 | /// replacement provided. If `limit` is 0, then all non-overlapping matches |
506 | /// are replaced. |
507 | /// |
508 | /// See the documentation for `replace` for details on how to access |
509 | /// capturing group matches in the replacement text. |
510 | pub fn replacen<'t, R: Replacer>( |
511 | &self, |
512 | text: &'t [u8], |
513 | limit: usize, |
514 | mut rep: R, |
515 | ) -> Cow<'t, [u8]> { |
516 | if let Some(rep) = rep.no_expansion() { |
517 | let mut it = self.find_iter(text).enumerate().peekable(); |
518 | if it.peek().is_none() { |
519 | return Cow::Borrowed(text); |
520 | } |
521 | let mut new = Vec::with_capacity(text.len()); |
522 | let mut last_match = 0; |
523 | for (i, m) in it { |
524 | new.extend_from_slice(&text[last_match..m.start()]); |
525 | new.extend_from_slice(&rep); |
526 | last_match = m.end(); |
527 | if limit > 0 && i >= limit - 1 { |
528 | break; |
529 | } |
530 | } |
531 | new.extend_from_slice(&text[last_match..]); |
532 | return Cow::Owned(new); |
533 | } |
534 | |
535 | // The slower path, which we use if the replacement needs access to |
536 | // capture groups. |
537 | let mut it = self.captures_iter(text).enumerate().peekable(); |
538 | if it.peek().is_none() { |
539 | return Cow::Borrowed(text); |
540 | } |
541 | let mut new = Vec::with_capacity(text.len()); |
542 | let mut last_match = 0; |
543 | for (i, cap) in it { |
544 | // unwrap on 0 is OK because captures only reports matches |
545 | let m = cap.get(0).unwrap(); |
546 | new.extend_from_slice(&text[last_match..m.start()]); |
547 | rep.replace_append(&cap, &mut new); |
548 | last_match = m.end(); |
549 | if limit > 0 && i >= limit - 1 { |
550 | break; |
551 | } |
552 | } |
553 | new.extend_from_slice(&text[last_match..]); |
554 | Cow::Owned(new) |
555 | } |
556 | } |
557 | |
558 | /// Advanced or "lower level" search methods. |
559 | impl Regex { |
560 | /// Returns the end location of a match in the text given. |
561 | /// |
562 | /// This method may have the same performance characteristics as |
563 | /// `is_match`, except it provides an end location for a match. In |
564 | /// particular, the location returned *may be shorter* than the proper end |
565 | /// of the leftmost-first match that you would find via `Regex::find`. |
566 | /// |
567 | /// Note that it is not guaranteed that this routine finds the shortest or |
568 | /// "earliest" possible match. Instead, the main idea of this API is that |
569 | /// it returns the offset at the point at which the internal regex engine |
570 | /// has determined that a match has occurred. This may vary depending on |
571 | /// which internal regex engine is used, and thus, the offset itself may |
572 | /// change. |
573 | /// |
574 | /// # Example |
575 | /// |
576 | /// Typically, `a+` would match the entire first sequence of `a` in some |
577 | /// text, but `shortest_match` can give up as soon as it sees the first |
578 | /// `a`. |
579 | /// |
580 | /// ```rust |
581 | /// # use regex::bytes::Regex; |
582 | /// # fn main() { |
583 | /// let text = b"aaaaa" ; |
584 | /// let pos = Regex::new(r"a+" ).unwrap().shortest_match(text); |
585 | /// assert_eq!(pos, Some(1)); |
586 | /// # } |
587 | /// ``` |
588 | pub fn shortest_match(&self, text: &[u8]) -> Option<usize> { |
589 | self.shortest_match_at(text, 0) |
590 | } |
591 | |
592 | /// Returns the same as shortest_match, but starts the search at the given |
593 | /// offset. |
594 | /// |
595 | /// The significance of the starting point is that it takes the surrounding |
596 | /// context into consideration. For example, the `\A` anchor can only |
597 | /// match when `start == 0`. |
598 | pub fn shortest_match_at( |
599 | &self, |
600 | text: &[u8], |
601 | start: usize, |
602 | ) -> Option<usize> { |
603 | self.0.searcher().shortest_match_at(text, start) |
604 | } |
605 | |
606 | /// Returns the same as is_match, but starts the search at the given |
607 | /// offset. |
608 | /// |
609 | /// The significance of the starting point is that it takes the surrounding |
610 | /// context into consideration. For example, the `\A` anchor can only |
611 | /// match when `start == 0`. |
612 | pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { |
613 | self.0.searcher().is_match_at(text, start) |
614 | } |
615 | |
616 | /// Returns the same as find, but starts the search at the given |
617 | /// offset. |
618 | /// |
619 | /// The significance of the starting point is that it takes the surrounding |
620 | /// context into consideration. For example, the `\A` anchor can only |
621 | /// match when `start == 0`. |
622 | pub fn find_at<'t>( |
623 | &self, |
624 | text: &'t [u8], |
625 | start: usize, |
626 | ) -> Option<Match<'t>> { |
627 | self.0 |
628 | .searcher() |
629 | .find_at(text, start) |
630 | .map(|(s, e)| Match::new(text, s, e)) |
631 | } |
632 | |
633 | /// Returns the same as [`Regex::captures`], but starts the search at the |
634 | /// given offset. |
635 | /// |
636 | /// The significance of the starting point is that it takes the surrounding |
637 | /// context into consideration. For example, the `\A` anchor can only |
638 | /// match when `start == 0`. |
639 | pub fn captures_at<'t>( |
640 | &self, |
641 | text: &'t [u8], |
642 | start: usize, |
643 | ) -> Option<Captures<'t>> { |
644 | let mut locs = self.capture_locations(); |
645 | self.captures_read_at(&mut locs, text, start).map(move |_| Captures { |
646 | text, |
647 | locs: locs.0, |
648 | named_groups: self.0.capture_name_idx().clone(), |
649 | }) |
650 | } |
651 | |
652 | /// This is like `captures`, but uses |
653 | /// [`CaptureLocations`](struct.CaptureLocations.html) |
654 | /// instead of |
655 | /// [`Captures`](struct.Captures.html) in order to amortize allocations. |
656 | /// |
657 | /// To create a `CaptureLocations` value, use the |
658 | /// `Regex::capture_locations` method. |
659 | /// |
660 | /// This returns the overall match if this was successful, which is always |
661 | /// equivalence to the `0`th capture group. |
662 | pub fn captures_read<'t>( |
663 | &self, |
664 | locs: &mut CaptureLocations, |
665 | text: &'t [u8], |
666 | ) -> Option<Match<'t>> { |
667 | self.captures_read_at(locs, text, 0) |
668 | } |
669 | |
670 | /// Returns the same as `captures_read`, but starts the search at the given |
671 | /// offset and populates the capture locations given. |
672 | /// |
673 | /// The significance of the starting point is that it takes the surrounding |
674 | /// context into consideration. For example, the `\A` anchor can only |
675 | /// match when `start == 0`. |
676 | pub fn captures_read_at<'t>( |
677 | &self, |
678 | locs: &mut CaptureLocations, |
679 | text: &'t [u8], |
680 | start: usize, |
681 | ) -> Option<Match<'t>> { |
682 | self.0 |
683 | .searcher() |
684 | .captures_read_at(&mut locs.0, text, start) |
685 | .map(|(s, e)| Match::new(text, s, e)) |
686 | } |
687 | |
688 | /// An undocumented alias for `captures_read_at`. |
689 | /// |
690 | /// The `regex-capi` crate previously used this routine, so to avoid |
691 | /// breaking that crate, we continue to provide the name as an undocumented |
692 | /// alias. |
693 | #[doc (hidden)] |
694 | pub fn read_captures_at<'t>( |
695 | &self, |
696 | locs: &mut CaptureLocations, |
697 | text: &'t [u8], |
698 | start: usize, |
699 | ) -> Option<Match<'t>> { |
700 | self.captures_read_at(locs, text, start) |
701 | } |
702 | } |
703 | |
704 | /// Auxiliary methods. |
705 | impl Regex { |
706 | /// Returns the original string of this regex. |
707 | pub fn as_str(&self) -> &str { |
708 | &self.0.regex_strings()[0] |
709 | } |
710 | |
711 | /// Returns an iterator over the capture names. |
712 | pub fn capture_names(&self) -> CaptureNames<'_> { |
713 | CaptureNames(self.0.capture_names().iter()) |
714 | } |
715 | |
716 | /// Returns the number of captures. |
717 | pub fn captures_len(&self) -> usize { |
718 | self.0.capture_names().len() |
719 | } |
720 | |
721 | /// Returns the total number of capturing groups that appear in every |
722 | /// possible match. |
723 | /// |
724 | /// If the number of capture groups can vary depending on the match, then |
725 | /// this returns `None`. That is, a value is only returned when the number |
726 | /// of matching groups is invariant or "static." |
727 | /// |
728 | /// Note that like [`Regex::captures_len`], this **does** include the |
729 | /// implicit capturing group corresponding to the entire match. Therefore, |
730 | /// when a non-None value is returned, it is guaranteed to be at least `1`. |
731 | /// Stated differently, a return value of `Some(0)` is impossible. |
732 | /// |
733 | /// # Example |
734 | /// |
735 | /// This shows a few cases where a static number of capture groups is |
736 | /// available and a few cases where it is not. |
737 | /// |
738 | /// ``` |
739 | /// use regex::bytes::Regex; |
740 | /// |
741 | /// let len = |pattern| { |
742 | /// Regex::new(pattern).map(|re| re.static_captures_len()) |
743 | /// }; |
744 | /// |
745 | /// assert_eq!(Some(1), len("a" )?); |
746 | /// assert_eq!(Some(2), len("(a)" )?); |
747 | /// assert_eq!(Some(2), len("(a)|(b)" )?); |
748 | /// assert_eq!(Some(3), len("(a)(b)|(c)(d)" )?); |
749 | /// assert_eq!(None, len("(a)|b" )?); |
750 | /// assert_eq!(None, len("a|(b)" )?); |
751 | /// assert_eq!(None, len("(b)*" )?); |
752 | /// assert_eq!(Some(2), len("(b)+" )?); |
753 | /// |
754 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
755 | /// ``` |
756 | #[inline ] |
757 | pub fn static_captures_len(&self) -> Option<usize> { |
758 | self.0.static_captures_len().map(|len| len.saturating_add(1)) |
759 | } |
760 | |
761 | /// Returns an empty set of capture locations that can be reused in |
762 | /// multiple calls to `captures_read` or `captures_read_at`. |
763 | pub fn capture_locations(&self) -> CaptureLocations { |
764 | CaptureLocations(self.0.searcher().locations()) |
765 | } |
766 | |
767 | /// An alias for `capture_locations` to preserve backward compatibility. |
768 | /// |
769 | /// The `regex-capi` crate uses this method, so to avoid breaking that |
770 | /// crate, we continue to export it as an undocumented API. |
771 | #[doc (hidden)] |
772 | pub fn locations(&self) -> CaptureLocations { |
773 | CaptureLocations(self.0.searcher().locations()) |
774 | } |
775 | } |
776 | |
777 | /// An iterator over all non-overlapping matches for a particular string. |
778 | /// |
779 | /// The iterator yields a tuple of integers corresponding to the start and end |
780 | /// of the match. The indices are byte offsets. The iterator stops when no more |
781 | /// matches can be found. |
782 | /// |
783 | /// `'r` is the lifetime of the compiled regular expression and `'t` is the |
784 | /// lifetime of the matched byte string. |
785 | #[derive (Debug)] |
786 | pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); |
787 | |
788 | impl<'r, 't> Iterator for Matches<'r, 't> { |
789 | type Item = Match<'t>; |
790 | |
791 | fn next(&mut self) -> Option<Match<'t>> { |
792 | let text: &[u8] = self.0.text(); |
793 | self.0.next().map(|(s: usize, e: usize)| Match::new(haystack:text, start:s, end:e)) |
794 | } |
795 | } |
796 | |
797 | impl<'r, 't> FusedIterator for Matches<'r, 't> {} |
798 | |
799 | /// An iterator that yields all non-overlapping capture groups matching a |
800 | /// particular regular expression. |
801 | /// |
802 | /// The iterator stops when no more matches can be found. |
803 | /// |
804 | /// `'r` is the lifetime of the compiled regular expression and `'t` is the |
805 | /// lifetime of the matched byte string. |
806 | #[derive (Debug)] |
807 | pub struct CaptureMatches<'r, 't>( |
808 | re_trait::CaptureMatches<'t, ExecNoSync<'r>>, |
809 | ); |
810 | |
811 | impl<'r, 't> Iterator for CaptureMatches<'r, 't> { |
812 | type Item = Captures<'t>; |
813 | |
814 | fn next(&mut self) -> Option<Captures<'t>> { |
815 | self.0.next().map(|locs: Locations| Captures { |
816 | text: self.0.text(), |
817 | locs, |
818 | named_groups: self.0.regex().capture_name_idx().clone(), |
819 | }) |
820 | } |
821 | } |
822 | |
823 | impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} |
824 | |
825 | /// Yields all substrings delimited by a regular expression match. |
826 | /// |
827 | /// `'r` is the lifetime of the compiled regular expression and `'t` is the |
828 | /// lifetime of the byte string being split. |
829 | #[derive (Debug)] |
830 | pub struct Split<'r, 't> { |
831 | finder: Matches<'r, 't>, |
832 | last: usize, |
833 | } |
834 | |
835 | impl<'r, 't> Iterator for Split<'r, 't> { |
836 | type Item = &'t [u8]; |
837 | |
838 | fn next(&mut self) -> Option<&'t [u8]> { |
839 | let text: &[u8] = self.finder.0.text(); |
840 | match self.finder.next() { |
841 | None => { |
842 | if self.last > text.len() { |
843 | None |
844 | } else { |
845 | let s: &[u8] = &text[self.last..]; |
846 | self.last = text.len() + 1; // Next call will return None |
847 | Some(s) |
848 | } |
849 | } |
850 | Some(m: Match<'_>) => { |
851 | let matched: &[u8] = &text[self.last..m.start()]; |
852 | self.last = m.end(); |
853 | Some(matched) |
854 | } |
855 | } |
856 | } |
857 | } |
858 | |
859 | impl<'r, 't> FusedIterator for Split<'r, 't> {} |
860 | |
861 | /// Yields at most `N` substrings delimited by a regular expression match. |
862 | /// |
863 | /// The last substring will be whatever remains after splitting. |
864 | /// |
865 | /// `'r` is the lifetime of the compiled regular expression and `'t` is the |
866 | /// lifetime of the byte string being split. |
867 | #[derive (Debug)] |
868 | pub struct SplitN<'r, 't> { |
869 | splits: Split<'r, 't>, |
870 | n: usize, |
871 | } |
872 | |
873 | impl<'r, 't> Iterator for SplitN<'r, 't> { |
874 | type Item = &'t [u8]; |
875 | |
876 | fn next(&mut self) -> Option<&'t [u8]> { |
877 | if self.n == 0 { |
878 | return None; |
879 | } |
880 | |
881 | self.n -= 1; |
882 | if self.n > 0 { |
883 | return self.splits.next(); |
884 | } |
885 | |
886 | let text = self.splits.finder.0.text(); |
887 | if self.splits.last > text.len() { |
888 | // We've already returned all substrings. |
889 | None |
890 | } else { |
891 | // self.n == 0, so future calls will return None immediately |
892 | Some(&text[self.splits.last..]) |
893 | } |
894 | } |
895 | |
896 | fn size_hint(&self) -> (usize, Option<usize>) { |
897 | (0, Some(self.n)) |
898 | } |
899 | } |
900 | |
901 | impl<'r, 't> FusedIterator for SplitN<'r, 't> {} |
902 | |
/// An iterator over the names of all possible captures.
///
/// `None` indicates an unnamed capture; the first element (capture 0, the
/// whole matched region) is always unnamed.
///
/// `'r` is the lifetime of the compiled regular expression.
#[derive(Clone, Debug)]
pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);

impl<'r> Iterator for CaptureNames<'r> {
    type Item = Option<&'r str>;

    /// Returns the next capture name, borrowed from the underlying slice.
    fn next(&mut self) -> Option<Option<&'r str>> {
        // `as_deref` turns `&Option<String>` into `Option<&str>` without
        // cloning the name.
        self.0.next().map(|slot| slot.as_deref())
    }

    /// Forwards to the underlying slice iterator, so the hint is exact.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }

    /// Counts the remaining names using the underlying slice iterator.
    fn count(self) -> usize {
        self.0.count()
    }
}

impl<'r> ExactSizeIterator for CaptureNames<'r> {}

impl<'r> FusedIterator for CaptureNames<'r> {}
934 | |
/// CaptureLocations is a low level representation of the raw offsets of each
/// submatch.
///
/// You can think of this as a lower level
/// [`Captures`](struct.Captures.html), where this type does not support
/// named capturing groups directly and it does not borrow the text that these
/// offsets were matched on.
///
/// Primarily, this type is useful when using the lower level `Regex` APIs
/// such as `captures_read`, which permits amortizing the allocation in which
/// capture match locations are stored.
///
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
///
/// # Example
///
/// This example shows how to create and use `CaptureLocations` in a search.
///
/// ```
/// use regex::bytes::Regex;
///
/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
/// let mut locs = re.capture_locations();
/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
/// assert_eq!(0..17, m.range());
/// assert_eq!(Some((0, 17)), locs.get(0));
/// assert_eq!(Some((0, 5)), locs.get(1));
/// assert_eq!(Some((6, 17)), locs.get(2));
///
/// // Asking for an invalid capture group always returns None.
/// assert_eq!(None, locs.get(3));
/// assert_eq!(None, locs.get(34973498648));
/// assert_eq!(None, locs.get(9944060567225171988));
/// ```
#[derive (Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);
973 | |
/// A type alias for `CaptureLocations`, kept for backwards compatibility.
///
/// `CaptureLocations` was previously exported as `Locations` in an
/// undocumented API. To avoid breaking code that relies on that name (e.g.,
/// in `regex-capi`), the same undocumented API is still re-exported.
#[doc (hidden)]
pub type Locations = CaptureLocations;
981 | |
982 | impl CaptureLocations { |
983 | /// Returns the start and end positions of the Nth capture group. Returns |
984 | /// `None` if `i` is not a valid capture group or if the capture group did |
985 | /// not match anything. The positions returned are *always* byte indices |
986 | /// with respect to the original string matched. |
987 | #[inline ] |
988 | pub fn get(&self, i: usize) -> Option<(usize, usize)> { |
989 | self.0.pos(i) |
990 | } |
991 | |
992 | /// Returns the total number of capture groups (even if they didn't match). |
993 | /// |
994 | /// This is always at least `1` since every regex has at least `1` |
995 | /// capturing group that corresponds to the entire match. |
996 | #[inline ] |
997 | pub fn len(&self) -> usize { |
998 | self.0.len() |
999 | } |
1000 | |
1001 | /// An alias for the `get` method for backwards compatibility. |
1002 | /// |
1003 | /// Previously, we exported `get` as `pos` in an undocumented API. To |
1004 | /// prevent breaking that code (e.g., in `regex-capi`), we continue |
1005 | /// re-exporting the same undocumented API. |
1006 | #[doc (hidden)] |
1007 | #[inline ] |
1008 | pub fn pos(&self, i: usize) -> Option<(usize, usize)> { |
1009 | self.get(i) |
1010 | } |
1011 | } |
1012 | |
1013 | /// Captures represents a group of captured byte strings for a single match. |
1014 | /// |
1015 | /// The 0th capture always corresponds to the entire match. Each subsequent |
1016 | /// index corresponds to the next capture group in the regex. If a capture |
1017 | /// group is named, then the matched byte string is *also* available via the |
1018 | /// `name` method. (Note that the 0th capture is always unnamed and so must be |
1019 | /// accessed with the `get` method.) |
1020 | /// |
1021 | /// Positions returned from a capture group are always byte indices. |
1022 | /// |
1023 | /// `'t` is the lifetime of the matched text. |
1024 | pub struct Captures<'t> { |
1025 | text: &'t [u8], |
1026 | locs: re_trait::Locations, |
1027 | named_groups: Arc<HashMap<String, usize>>, |
1028 | } |
1029 | |
1030 | impl<'t> Captures<'t> { |
1031 | /// Returns the match associated with the capture group at index `i`. If |
1032 | /// `i` does not correspond to a capture group, or if the capture group |
1033 | /// did not participate in the match, then `None` is returned. |
1034 | /// |
1035 | /// # Examples |
1036 | /// |
1037 | /// Get the text of the match with a default of an empty string if this |
1038 | /// group didn't participate in the match: |
1039 | /// |
1040 | /// ```rust |
1041 | /// # use regex::bytes::Regex; |
1042 | /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))" ).unwrap(); |
1043 | /// let caps = re.captures(b"abc123" ).unwrap(); |
1044 | /// |
1045 | /// let text1 = caps.get(1).map_or(&b"" [..], |m| m.as_bytes()); |
1046 | /// let text2 = caps.get(2).map_or(&b"" [..], |m| m.as_bytes()); |
1047 | /// assert_eq!(text1, &b"123" [..]); |
1048 | /// assert_eq!(text2, &b"" [..]); |
1049 | /// ``` |
1050 | pub fn get(&self, i: usize) -> Option<Match<'t>> { |
1051 | self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) |
1052 | } |
1053 | |
1054 | /// Returns the match for the capture group named `name`. If `name` isn't a |
1055 | /// valid capture group or didn't match anything, then `None` is returned. |
1056 | pub fn name(&self, name: &str) -> Option<Match<'t>> { |
1057 | self.named_groups.get(name).and_then(|&i| self.get(i)) |
1058 | } |
1059 | |
1060 | /// An iterator that yields all capturing matches in the order in which |
1061 | /// they appear in the regex. If a particular capture group didn't |
1062 | /// participate in the match, then `None` is yielded for that capture. |
1063 | /// |
1064 | /// The first match always corresponds to the overall match of the regex. |
1065 | pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { |
1066 | SubCaptureMatches { caps: self, it: self.locs.iter() } |
1067 | } |
1068 | |
1069 | /// Expands all instances of `$name` in `replacement` to the corresponding |
1070 | /// capture group `name`, and writes them to the `dst` buffer given. |
1071 | /// |
1072 | /// `name` may be an integer corresponding to the index of the capture |
1073 | /// group (counted by order of opening parenthesis where `0` is the |
1074 | /// entire match) or it can be a name (consisting of letters, digits or |
1075 | /// underscores) corresponding to a named capture group. |
1076 | /// |
1077 | /// If `name` isn't a valid capture group (whether the name doesn't exist |
1078 | /// or isn't a valid index), then it is replaced with the empty string. |
1079 | /// |
1080 | /// The longest possible name consisting of the characters `[_0-9A-Za-z]` |
1081 | /// is used. e.g., `$1a` looks up the capture group named `1a` and not the |
1082 | /// capture group at index `1`. To exert more precise control over the |
1083 | /// name, or to refer to a capture group name that uses characters outside |
1084 | /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When |
1085 | /// using braces, any sequence of valid UTF-8 bytes is permitted. If the |
1086 | /// sequence does not refer to a capture group name in the corresponding |
1087 | /// regex, then it is replaced with an empty string. |
1088 | /// |
1089 | /// To write a literal `$` use `$$`. |
1090 | pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { |
1091 | expand_bytes(self, replacement, dst) |
1092 | } |
1093 | |
1094 | /// Returns the total number of capture groups (even if they didn't match). |
1095 | /// |
1096 | /// This is always at least `1`, since every regex has at least one capture |
1097 | /// group that corresponds to the full match. |
1098 | #[inline ] |
1099 | pub fn len(&self) -> usize { |
1100 | self.locs.len() |
1101 | } |
1102 | } |
1103 | |
1104 | impl<'t> fmt::Debug for Captures<'t> { |
1105 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1106 | f.debug_tuple(name:"Captures" ).field(&CapturesDebug(self)).finish() |
1107 | } |
1108 | } |
1109 | |
1110 | struct CapturesDebug<'c, 't>(&'c Captures<'t>); |
1111 | |
1112 | impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { |
1113 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1114 | fn escape_bytes(bytes: &[u8]) -> String { |
1115 | let mut s = String::new(); |
1116 | for &b in bytes { |
1117 | s.push_str(&escape_byte(b)); |
1118 | } |
1119 | s |
1120 | } |
1121 | |
1122 | fn escape_byte(byte: u8) -> String { |
1123 | use std::ascii::escape_default; |
1124 | |
1125 | let escaped: Vec<u8> = escape_default(byte).collect(); |
1126 | String::from_utf8_lossy(&escaped).into_owned() |
1127 | } |
1128 | |
1129 | // We'd like to show something nice here, even if it means an |
1130 | // allocation to build a reverse index. |
1131 | let slot_to_name: HashMap<&usize, &String> = |
1132 | self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); |
1133 | let mut map = f.debug_map(); |
1134 | for (slot, m) in self.0.locs.iter().enumerate() { |
1135 | let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); |
1136 | if let Some(name) = slot_to_name.get(&slot) { |
1137 | map.entry(&name, &m); |
1138 | } else { |
1139 | map.entry(&slot, &m); |
1140 | } |
1141 | } |
1142 | map.finish() |
1143 | } |
1144 | } |
1145 | |
1146 | /// Get a group by index. |
1147 | /// |
1148 | /// `'t` is the lifetime of the matched text. |
1149 | /// |
1150 | /// The text can't outlive the `Captures` object if this method is |
1151 | /// used, because of how `Index` is defined (normally `a[i]` is part |
1152 | /// of `a` and can't outlive it); to do that, use `get()` instead. |
1153 | /// |
1154 | /// # Panics |
1155 | /// |
1156 | /// If there is no group at the given index. |
1157 | impl<'t> Index<usize> for Captures<'t> { |
1158 | type Output = [u8]; |
1159 | |
1160 | fn index(&self, i: usize) -> &[u8] { |
1161 | self.get(i) |
1162 | .map(|m: Match<'_>| m.as_bytes()) |
1163 | .unwrap_or_else(|| panic!("no group at index ' {}'" , i)) |
1164 | } |
1165 | } |
1166 | |
1167 | /// Get a group by name. |
1168 | /// |
1169 | /// `'t` is the lifetime of the matched text and `'i` is the lifetime |
1170 | /// of the group name (the index). |
1171 | /// |
1172 | /// The text can't outlive the `Captures` object if this method is |
1173 | /// used, because of how `Index` is defined (normally `a[i]` is part |
1174 | /// of `a` and can't outlive it); to do that, use `name` instead. |
1175 | /// |
1176 | /// # Panics |
1177 | /// |
1178 | /// If there is no group named by the given value. |
1179 | impl<'t, 'i> Index<&'i str> for Captures<'t> { |
1180 | type Output = [u8]; |
1181 | |
1182 | fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { |
1183 | self.name(name) |
1184 | .map(|m: Match<'_>| m.as_bytes()) |
1185 | .unwrap_or_else(|| panic!("no group named ' {}'" , name)) |
1186 | } |
1187 | } |
1188 | |
1189 | /// An iterator that yields all capturing matches in the order in which they |
1190 | /// appear in the regex. |
1191 | /// |
1192 | /// If a particular capture group didn't participate in the match, then `None` |
1193 | /// is yielded for that capture. The first match always corresponds to the |
1194 | /// overall match of the regex. |
1195 | /// |
1196 | /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and |
1197 | /// the lifetime `'t` corresponds to the originally matched text. |
1198 | #[derive (Clone, Debug)] |
1199 | pub struct SubCaptureMatches<'c, 't> { |
1200 | caps: &'c Captures<'t>, |
1201 | it: SubCapturesPosIter<'c>, |
1202 | } |
1203 | |
1204 | impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { |
1205 | type Item = Option<Match<'t>>; |
1206 | |
1207 | fn next(&mut self) -> Option<Option<Match<'t>>> { |
1208 | self.it |
1209 | .next() |
1210 | .map(|cap: Option<(usize, usize)>| cap.map(|(s: usize, e: usize)| Match::new(self.caps.text, start:s, end:e))) |
1211 | } |
1212 | } |
1213 | |
1214 | impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} |
1215 | |
/// Replacer describes types that can be used to replace matches in a byte
/// string.
///
/// In general, users of this crate shouldn't need to implement this trait,
/// since implementations are already provided for `&[u8]` along with other
/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any
/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases.
pub trait Replacer {
    /// Appends text to `dst` to replace the current match.
    ///
    /// The current match is represented by `caps`, which is guaranteed to
    /// have a match at capture group `0`.
    ///
    /// For example, a no-op replacement would be
    /// `dst.extend(&caps[0])`.
    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);

    /// Return a fixed unchanging replacement byte string.
    ///
    /// When doing replacements, if access to `Captures` is not needed (e.g.,
    /// the replacement byte string does not need `$` expansion), then it can
    /// be beneficial to avoid finding sub-captures.
    ///
    /// In general, this is called once for every call to `replacen`.
    ///
    /// The default implementation returns `None`.
    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
        None
    }

    /// Return a `Replacer` that borrows and wraps this `Replacer`.
    ///
    /// This is useful when you want to take a generic `Replacer` (which might
    /// not be cloneable) and use it without consuming it, so it can be used
    /// more than once.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::{Regex, Replacer};
    ///
    /// fn replace_all_twice<R: Replacer>(
    ///     re: Regex,
    ///     src: &[u8],
    ///     mut rep: R,
    /// ) -> Vec<u8> {
    ///     let dst = re.replace_all(src, rep.by_ref());
    ///     let dst = re.replace_all(&dst, rep.by_ref());
    ///     dst.into_owned()
    /// }
    /// ```
    fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
        ReplacerRef(self)
    }
}
1269 | |
1270 | /// By-reference adaptor for a `Replacer` |
1271 | /// |
1272 | /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). |
1273 | #[derive (Debug)] |
1274 | pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); |
1275 | |
1276 | impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { |
1277 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1278 | self.0.replace_append(caps, dst) |
1279 | } |
1280 | fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { |
1281 | self.0.no_expansion() |
1282 | } |
1283 | } |
1284 | |
1285 | impl<'a> Replacer for &'a [u8] { |
1286 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1287 | caps.expand(*self, dst); |
1288 | } |
1289 | |
1290 | fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { |
1291 | no_expansion(self) |
1292 | } |
1293 | } |
1294 | |
1295 | impl<'a> Replacer for &'a Vec<u8> { |
1296 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1297 | caps.expand(*self, dst); |
1298 | } |
1299 | |
1300 | fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { |
1301 | no_expansion(self) |
1302 | } |
1303 | } |
1304 | |
1305 | impl Replacer for Vec<u8> { |
1306 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1307 | caps.expand(self, dst); |
1308 | } |
1309 | |
1310 | fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { |
1311 | no_expansion(self) |
1312 | } |
1313 | } |
1314 | |
1315 | impl<'a> Replacer for Cow<'a, [u8]> { |
1316 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1317 | caps.expand(self.as_ref(), dst); |
1318 | } |
1319 | |
1320 | fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { |
1321 | no_expansion(self) |
1322 | } |
1323 | } |
1324 | |
1325 | impl<'a> Replacer for &'a Cow<'a, [u8]> { |
1326 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1327 | caps.expand(self.as_ref(), dst); |
1328 | } |
1329 | |
1330 | fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { |
1331 | no_expansion(self) |
1332 | } |
1333 | } |
1334 | |
1335 | fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { |
1336 | let s: &[u8] = t.as_ref(); |
1337 | match find_byte(needle:b'$' , haystack:s) { |
1338 | Some(_) => None, |
1339 | None => Some(Cow::Borrowed(s)), |
1340 | } |
1341 | } |
1342 | |
1343 | impl<F, T> Replacer for F |
1344 | where |
1345 | F: FnMut(&Captures<'_>) -> T, |
1346 | T: AsRef<[u8]>, |
1347 | { |
1348 | fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { |
1349 | dst.extend_from_slice((*self)(caps).as_ref()); |
1350 | } |
1351 | } |
1352 | |
1353 | /// `NoExpand` indicates literal byte string replacement. |
1354 | /// |
1355 | /// It can be used with `replace` and `replace_all` to do a literal byte string |
1356 | /// replacement without expanding `$name` to their corresponding capture |
1357 | /// groups. This can be both convenient (to avoid escaping `$`, for example) |
1358 | /// and performant (since capture groups don't need to be found). |
1359 | /// |
1360 | /// `'t` is the lifetime of the literal text. |
1361 | #[derive (Clone, Debug)] |
1362 | pub struct NoExpand<'t>(pub &'t [u8]); |
1363 | |
1364 | impl<'t> Replacer for NoExpand<'t> { |
1365 | fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { |
1366 | dst.extend_from_slice(self.0); |
1367 | } |
1368 | |
1369 | fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { |
1370 | Some(Cow::Borrowed(self.0)) |
1371 | } |
1372 | } |
1373 | |