1 | /*! |
2 | Defines a high-level intermediate representation for regular expressions. |
3 | */ |
4 | use std::char; |
5 | use std::cmp; |
6 | use std::error; |
7 | use std::fmt; |
8 | use std::result; |
9 | use std::u8; |
10 | |
11 | use crate::ast::Span; |
12 | use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; |
13 | use crate::unicode; |
14 | |
15 | pub use crate::hir::visitor::{visit, Visitor}; |
16 | pub use crate::unicode::CaseFoldError; |
17 | |
18 | mod interval; |
19 | pub mod literal; |
20 | pub mod print; |
21 | pub mod translate; |
22 | mod visitor; |
23 | |
24 | /// An error that can occur while translating an `Ast` to a `Hir`. |
25 | #[derive(Clone, Debug, Eq, PartialEq)] |
26 | pub struct Error { |
27 | /// The kind of error. |
28 | kind: ErrorKind, |
29 | /// The original pattern that the translator's Ast was parsed from. Every |
30 | /// span in an error is a valid range into this string. |
31 | pattern: String, |
32 | /// The span of this error, derived from the Ast given to the translator. |
33 | span: Span, |
34 | } |
35 | |
36 | impl Error { |
37 | /// Return the type of this error. |
38 | pub fn kind(&self) -> &ErrorKind { |
39 | &self.kind |
40 | } |
41 | |
42 | /// The original pattern string in which this error occurred. |
43 | /// |
44 | /// Every span reported by this error is reported in terms of this string. |
45 | pub fn pattern(&self) -> &str { |
46 | &self.pattern |
47 | } |
48 | |
49 | /// Return the span at which this error occurred. |
50 | pub fn span(&self) -> &Span { |
51 | &self.span |
52 | } |
53 | } |
54 | |
55 | /// The type of an error that occurred while building an `Hir`. |
56 | #[derive(Clone, Debug, Eq, PartialEq)] |
57 | pub enum ErrorKind { |
58 | /// This error occurs when a Unicode feature is used when Unicode |
59 | /// support is disabled. For example `(?-u:\pL)` would trigger this error. |
60 | UnicodeNotAllowed, |
61 | /// This error occurs when translating a pattern that could match a byte |
62 | /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. |
63 | InvalidUtf8, |
64 | /// This occurs when an unrecognized Unicode property name could not |
65 | /// be found. |
66 | UnicodePropertyNotFound, |
67 | /// This occurs when an unrecognized Unicode property value could not |
68 | /// be found. |
69 | UnicodePropertyValueNotFound, |
70 | /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or |
71 | /// `\d`) could not be found. This can occur when the `unicode-perl` |
72 | /// crate feature is not enabled. |
73 | UnicodePerlClassNotFound, |
74 | /// This occurs when the Unicode simple case mapping tables are not |
75 | /// available, and the regular expression required Unicode aware case |
76 | /// insensitivity. |
77 | UnicodeCaseUnavailable, |
78 | /// This occurs when the translator attempts to construct a character class |
79 | /// that is empty. |
80 | /// |
81 | /// Note that this restriction in the translator may be removed in the |
82 | /// future. |
83 | EmptyClassNotAllowed, |
84 | /// Hints that destructuring should not be exhaustive. |
85 | /// |
86 | /// This enum may grow additional variants, so this makes sure clients |
87 | /// don't count on exhaustive matching. (Otherwise, adding a new variant |
88 | /// could break existing code.) |
89 | #[doc (hidden)] |
90 | __Nonexhaustive, |
91 | } |
92 | |
93 | impl ErrorKind { |
94 | // TODO: Remove this method entirely on the next breaking semver release. |
95 | #[allow (deprecated)] |
96 | fn description(&self) -> &str { |
97 | use self::ErrorKind::*; |
98 | match *self { |
99 | UnicodeNotAllowed => "Unicode not allowed here" , |
100 | InvalidUtf8 => "pattern can match invalid UTF-8" , |
101 | UnicodePropertyNotFound => "Unicode property not found" , |
102 | UnicodePropertyValueNotFound => "Unicode property value not found" , |
103 | UnicodePerlClassNotFound => { |
104 | "Unicode-aware Perl class not found \ |
105 | (make sure the unicode-perl feature is enabled)" |
106 | } |
107 | UnicodeCaseUnavailable => { |
108 | "Unicode-aware case insensitivity matching is not available \ |
109 | (make sure the unicode-case feature is enabled)" |
110 | } |
111 | EmptyClassNotAllowed => "empty character classes are not allowed" , |
112 | __Nonexhaustive => unreachable!(), |
113 | } |
114 | } |
115 | } |
116 | |
117 | impl error::Error for Error { |
118 | // TODO: Remove this method entirely on the next breaking semver release. |
119 | #[allow (deprecated)] |
120 | fn description(&self) -> &str { |
121 | self.kind.description() |
122 | } |
123 | } |
124 | |
125 | impl fmt::Display for Error { |
126 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
127 | crate::error::Formatter::from(self).fmt(f) |
128 | } |
129 | } |
130 | |
131 | impl fmt::Display for ErrorKind { |
132 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
133 | // TODO: Remove this on the next breaking semver release. |
134 | #[allow (deprecated)] |
135 | f.write_str(self.description()) |
136 | } |
137 | } |
138 | |
139 | /// A high-level intermediate representation (HIR) for a regular expression. |
140 | /// |
141 | /// The HIR of a regular expression represents an intermediate step between its |
142 | /// abstract syntax (a structured description of the concrete syntax) and |
143 | /// compiled byte codes. The purpose of HIR is to make regular expressions |
144 | /// easier to analyze. In particular, the AST is much more complex than the |
145 | /// HIR. For example, while an AST supports arbitrarily nested character |
146 | /// classes, the HIR will flatten all nested classes into a single set. The HIR |
147 | /// will also "compile away" every flag present in the concrete syntax. For |
148 | /// example, users of HIR expressions never need to worry about case folding; |
149 | /// it is handled automatically by the translator (e.g., by translating `(?i)A` |
150 | /// to `[aA]`). |
151 | /// |
152 | /// If the HIR was produced by a translator that disallows invalid UTF-8, then |
153 | /// the HIR is guaranteed to match UTF-8 exclusively. |
154 | /// |
155 | /// This type defines its own destructor that uses constant stack space and |
156 | /// heap space proportional to the size of the HIR. |
157 | /// |
158 | /// The specific type of an HIR expression can be accessed via its `kind` |
159 | /// or `into_kind` methods. This extra level of indirection exists for two |
160 | /// reasons: |
161 | /// |
162 | /// 1. Construction of an HIR expression *must* use the constructor methods |
163 | /// on this `Hir` type instead of building the `HirKind` values directly. |
164 | /// This permits construction to enforce invariants like "concatenations |
165 | /// always consist of two or more sub-expressions." |
166 | /// 2. Every HIR expression contains attributes that are defined inductively, |
167 | /// and can be computed cheaply during the construction process. For |
168 | /// example, one such attribute is whether the expression must match at the |
169 | /// beginning of the text. |
170 | /// |
171 | /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular |
172 | /// expression pattern string, and uses constant stack space and heap space |
173 | /// proportional to the size of the `Hir`. |
174 | #[derive(Clone, Debug, Eq, PartialEq)] |
175 | pub struct Hir { |
176 | /// The underlying HIR kind. |
177 | kind: HirKind, |
178 | /// Analysis info about this HIR, computed during construction. |
179 | info: HirInfo, |
180 | } |
181 | |
182 | /// The kind of an arbitrary `Hir` expression. |
183 | #[derive(Clone, Debug, Eq, PartialEq)] |
184 | pub enum HirKind { |
185 | /// The empty regular expression, which matches everything, including the |
186 | /// empty string. |
187 | Empty, |
188 | /// A single literal character that matches exactly this character. |
189 | Literal(Literal), |
190 | /// A single character class that matches any of the characters in the |
191 | /// class. A class can either consist of Unicode scalar values as |
192 | /// characters, or it can use bytes. |
193 | Class(Class), |
194 | /// An anchor assertion. An anchor assertion match always has zero length. |
195 | Anchor(Anchor), |
196 | /// A word boundary assertion, which may or may not be Unicode aware. A |
197 | /// word boundary assertion match always has zero length. |
198 | WordBoundary(WordBoundary), |
199 | /// A repetition operation applied to a child expression. |
200 | Repetition(Repetition), |
201 | /// A possibly capturing group, which contains a child expression. |
202 | Group(Group), |
203 | /// A concatenation of expressions. A concatenation always has at least two |
204 | /// child expressions. |
205 | /// |
206 | /// A concatenation matches only if each of its child expression matches |
207 | /// one after the other. |
208 | Concat(Vec<Hir>), |
209 | /// An alternation of expressions. An alternation always has at least two |
210 | /// child expressions. |
211 | /// |
212 | /// An alternation matches only if at least one of its child expression |
213 | /// matches. If multiple expressions match, then the leftmost is preferred. |
214 | Alternation(Vec<Hir>), |
215 | } |
216 | |
217 | impl Hir { |
218 | /// Returns a reference to the underlying HIR kind. |
219 | pub fn kind(&self) -> &HirKind { |
220 | &self.kind |
221 | } |
222 | |
223 | /// Consumes ownership of this HIR expression and returns its underlying |
224 | /// `HirKind`. |
225 | pub fn into_kind(mut self) -> HirKind { |
226 | use std::mem; |
227 | mem::replace(&mut self.kind, HirKind::Empty) |
228 | } |
229 | |
230 | /// Returns an empty HIR expression. |
231 | /// |
232 | /// An empty HIR expression always matches, including the empty string. |
233 | pub fn empty() -> Hir { |
234 | let mut info = HirInfo::new(); |
235 | info.set_always_utf8(true); |
236 | info.set_all_assertions(true); |
237 | info.set_anchored_start(false); |
238 | info.set_anchored_end(false); |
239 | info.set_line_anchored_start(false); |
240 | info.set_line_anchored_end(false); |
241 | info.set_any_anchored_start(false); |
242 | info.set_any_anchored_end(false); |
243 | info.set_match_empty(true); |
244 | info.set_literal(false); |
245 | info.set_alternation_literal(false); |
246 | Hir { kind: HirKind::Empty, info } |
247 | } |
248 | |
249 | /// Creates a literal HIR expression. |
250 | /// |
251 | /// If the given literal has a `Byte` variant with an ASCII byte, then this |
252 | /// method panics. This enforces the invariant that `Byte` variants are |
253 | /// only used to express matching of invalid UTF-8. |
254 | pub fn literal(lit: Literal) -> Hir { |
255 | if let Literal::Byte(b) = lit { |
256 | assert!(b > 0x7F); |
257 | } |
258 | |
259 | let mut info = HirInfo::new(); |
260 | info.set_always_utf8(lit.is_unicode()); |
261 | info.set_all_assertions(false); |
262 | info.set_anchored_start(false); |
263 | info.set_anchored_end(false); |
264 | info.set_line_anchored_start(false); |
265 | info.set_line_anchored_end(false); |
266 | info.set_any_anchored_start(false); |
267 | info.set_any_anchored_end(false); |
268 | info.set_match_empty(false); |
269 | info.set_literal(true); |
270 | info.set_alternation_literal(true); |
271 | Hir { kind: HirKind::Literal(lit), info } |
272 | } |
273 | |
274 | /// Creates a class HIR expression. |
275 | pub fn class(class: Class) -> Hir { |
276 | let mut info = HirInfo::new(); |
277 | info.set_always_utf8(class.is_always_utf8()); |
278 | info.set_all_assertions(false); |
279 | info.set_anchored_start(false); |
280 | info.set_anchored_end(false); |
281 | info.set_line_anchored_start(false); |
282 | info.set_line_anchored_end(false); |
283 | info.set_any_anchored_start(false); |
284 | info.set_any_anchored_end(false); |
285 | info.set_match_empty(false); |
286 | info.set_literal(false); |
287 | info.set_alternation_literal(false); |
288 | Hir { kind: HirKind::Class(class), info } |
289 | } |
290 | |
291 | /// Creates an anchor assertion HIR expression. |
292 | pub fn anchor(anchor: Anchor) -> Hir { |
293 | let mut info = HirInfo::new(); |
294 | info.set_always_utf8(true); |
295 | info.set_all_assertions(true); |
296 | info.set_anchored_start(false); |
297 | info.set_anchored_end(false); |
298 | info.set_line_anchored_start(false); |
299 | info.set_line_anchored_end(false); |
300 | info.set_any_anchored_start(false); |
301 | info.set_any_anchored_end(false); |
302 | info.set_match_empty(true); |
303 | info.set_literal(false); |
304 | info.set_alternation_literal(false); |
305 | if let Anchor::StartText = anchor { |
306 | info.set_anchored_start(true); |
307 | info.set_line_anchored_start(true); |
308 | info.set_any_anchored_start(true); |
309 | } |
310 | if let Anchor::EndText = anchor { |
311 | info.set_anchored_end(true); |
312 | info.set_line_anchored_end(true); |
313 | info.set_any_anchored_end(true); |
314 | } |
315 | if let Anchor::StartLine = anchor { |
316 | info.set_line_anchored_start(true); |
317 | } |
318 | if let Anchor::EndLine = anchor { |
319 | info.set_line_anchored_end(true); |
320 | } |
321 | Hir { kind: HirKind::Anchor(anchor), info } |
322 | } |
323 | |
324 | /// Creates a word boundary assertion HIR expression. |
325 | pub fn word_boundary(word_boundary: WordBoundary) -> Hir { |
326 | let mut info = HirInfo::new(); |
327 | info.set_always_utf8(true); |
328 | info.set_all_assertions(true); |
329 | info.set_anchored_start(false); |
330 | info.set_anchored_end(false); |
331 | info.set_line_anchored_start(false); |
332 | info.set_line_anchored_end(false); |
333 | info.set_any_anchored_start(false); |
334 | info.set_any_anchored_end(false); |
335 | info.set_literal(false); |
336 | info.set_alternation_literal(false); |
337 | // A negated word boundary matches '', so that's fine. But \b does not |
338 | // match \b, so why do we say it can match the empty string? Well, |
339 | // because, if you search for \b against 'a', it will report [0, 0) and |
340 | // [1, 1) as matches, and both of those matches correspond to the empty |
341 | // string. Thus, only *certain* empty strings match \b, which similarly |
342 | // applies to \B. |
343 | info.set_match_empty(true); |
344 | // Negated ASCII word boundaries can match invalid UTF-8. |
345 | if let WordBoundary::AsciiNegate = word_boundary { |
346 | info.set_always_utf8(false); |
347 | } |
348 | Hir { kind: HirKind::WordBoundary(word_boundary), info } |
349 | } |
350 | |
351 | /// Creates a repetition HIR expression. |
352 | pub fn repetition(rep: Repetition) -> Hir { |
353 | let mut info = HirInfo::new(); |
354 | info.set_always_utf8(rep.hir.is_always_utf8()); |
355 | info.set_all_assertions(rep.hir.is_all_assertions()); |
356 | // If this operator can match the empty string, then it can never |
357 | // be anchored. |
358 | info.set_anchored_start( |
359 | !rep.is_match_empty() && rep.hir.is_anchored_start(), |
360 | ); |
361 | info.set_anchored_end( |
362 | !rep.is_match_empty() && rep.hir.is_anchored_end(), |
363 | ); |
364 | info.set_line_anchored_start( |
365 | !rep.is_match_empty() && rep.hir.is_anchored_start(), |
366 | ); |
367 | info.set_line_anchored_end( |
368 | !rep.is_match_empty() && rep.hir.is_anchored_end(), |
369 | ); |
370 | info.set_any_anchored_start(rep.hir.is_any_anchored_start()); |
371 | info.set_any_anchored_end(rep.hir.is_any_anchored_end()); |
372 | info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); |
373 | info.set_literal(false); |
374 | info.set_alternation_literal(false); |
375 | Hir { kind: HirKind::Repetition(rep), info } |
376 | } |
377 | |
378 | /// Creates a group HIR expression. |
379 | pub fn group(group: Group) -> Hir { |
380 | let mut info = HirInfo::new(); |
381 | info.set_always_utf8(group.hir.is_always_utf8()); |
382 | info.set_all_assertions(group.hir.is_all_assertions()); |
383 | info.set_anchored_start(group.hir.is_anchored_start()); |
384 | info.set_anchored_end(group.hir.is_anchored_end()); |
385 | info.set_line_anchored_start(group.hir.is_line_anchored_start()); |
386 | info.set_line_anchored_end(group.hir.is_line_anchored_end()); |
387 | info.set_any_anchored_start(group.hir.is_any_anchored_start()); |
388 | info.set_any_anchored_end(group.hir.is_any_anchored_end()); |
389 | info.set_match_empty(group.hir.is_match_empty()); |
390 | info.set_literal(false); |
391 | info.set_alternation_literal(false); |
392 | Hir { kind: HirKind::Group(group), info } |
393 | } |
394 | |
395 | /// Returns the concatenation of the given expressions. |
396 | /// |
397 | /// This flattens the concatenation as appropriate. |
398 | pub fn concat(mut exprs: Vec<Hir>) -> Hir { |
399 | match exprs.len() { |
400 | 0 => Hir::empty(), |
401 | 1 => exprs.pop().unwrap(), |
402 | _ => { |
403 | let mut info = HirInfo::new(); |
404 | info.set_always_utf8(true); |
405 | info.set_all_assertions(true); |
406 | info.set_any_anchored_start(false); |
407 | info.set_any_anchored_end(false); |
408 | info.set_match_empty(true); |
409 | info.set_literal(true); |
410 | info.set_alternation_literal(true); |
411 | |
412 | // Some attributes require analyzing all sub-expressions. |
413 | for e in &exprs { |
414 | let x = info.is_always_utf8() && e.is_always_utf8(); |
415 | info.set_always_utf8(x); |
416 | |
417 | let x = info.is_all_assertions() && e.is_all_assertions(); |
418 | info.set_all_assertions(x); |
419 | |
420 | let x = info.is_any_anchored_start() |
421 | || e.is_any_anchored_start(); |
422 | info.set_any_anchored_start(x); |
423 | |
424 | let x = |
425 | info.is_any_anchored_end() || e.is_any_anchored_end(); |
426 | info.set_any_anchored_end(x); |
427 | |
428 | let x = info.is_match_empty() && e.is_match_empty(); |
429 | info.set_match_empty(x); |
430 | |
431 | let x = info.is_literal() && e.is_literal(); |
432 | info.set_literal(x); |
433 | |
434 | let x = info.is_alternation_literal() |
435 | && e.is_alternation_literal(); |
436 | info.set_alternation_literal(x); |
437 | } |
438 | // Anchored attributes require something slightly more |
439 | // sophisticated. Normally, WLOG, to determine whether an |
440 | // expression is anchored to the start, we'd only need to check |
441 | // the first expression of a concatenation. However, |
442 | // expressions like `$\b^` are still anchored to the start, |
443 | // but the first expression in the concatenation *isn't* |
444 | // anchored to the start. So the "first" expression to look at |
445 | // is actually one that is either not an assertion or is |
446 | // specifically the StartText assertion. |
447 | info.set_anchored_start( |
448 | exprs |
449 | .iter() |
450 | .take_while(|e| { |
451 | e.is_anchored_start() || e.is_all_assertions() |
452 | }) |
453 | .any(|e| e.is_anchored_start()), |
454 | ); |
455 | // Similarly for the end anchor, but in reverse. |
456 | info.set_anchored_end( |
457 | exprs |
458 | .iter() |
459 | .rev() |
460 | .take_while(|e| { |
461 | e.is_anchored_end() || e.is_all_assertions() |
462 | }) |
463 | .any(|e| e.is_anchored_end()), |
464 | ); |
465 | // Repeat the process for line anchors. |
466 | info.set_line_anchored_start( |
467 | exprs |
468 | .iter() |
469 | .take_while(|e| { |
470 | e.is_line_anchored_start() || e.is_all_assertions() |
471 | }) |
472 | .any(|e| e.is_line_anchored_start()), |
473 | ); |
474 | info.set_line_anchored_end( |
475 | exprs |
476 | .iter() |
477 | .rev() |
478 | .take_while(|e| { |
479 | e.is_line_anchored_end() || e.is_all_assertions() |
480 | }) |
481 | .any(|e| e.is_line_anchored_end()), |
482 | ); |
483 | Hir { kind: HirKind::Concat(exprs), info } |
484 | } |
485 | } |
486 | } |
487 | |
488 | /// Returns the alternation of the given expressions. |
489 | /// |
490 | /// This flattens the alternation as appropriate. |
491 | pub fn alternation(mut exprs: Vec<Hir>) -> Hir { |
492 | match exprs.len() { |
493 | 0 => Hir::empty(), |
494 | 1 => exprs.pop().unwrap(), |
495 | _ => { |
496 | let mut info = HirInfo::new(); |
497 | info.set_always_utf8(true); |
498 | info.set_all_assertions(true); |
499 | info.set_anchored_start(true); |
500 | info.set_anchored_end(true); |
501 | info.set_line_anchored_start(true); |
502 | info.set_line_anchored_end(true); |
503 | info.set_any_anchored_start(false); |
504 | info.set_any_anchored_end(false); |
505 | info.set_match_empty(false); |
506 | info.set_literal(false); |
507 | info.set_alternation_literal(true); |
508 | |
509 | // Some attributes require analyzing all sub-expressions. |
510 | for e in &exprs { |
511 | let x = info.is_always_utf8() && e.is_always_utf8(); |
512 | info.set_always_utf8(x); |
513 | |
514 | let x = info.is_all_assertions() && e.is_all_assertions(); |
515 | info.set_all_assertions(x); |
516 | |
517 | let x = info.is_anchored_start() && e.is_anchored_start(); |
518 | info.set_anchored_start(x); |
519 | |
520 | let x = info.is_anchored_end() && e.is_anchored_end(); |
521 | info.set_anchored_end(x); |
522 | |
523 | let x = info.is_line_anchored_start() |
524 | && e.is_line_anchored_start(); |
525 | info.set_line_anchored_start(x); |
526 | |
527 | let x = info.is_line_anchored_end() |
528 | && e.is_line_anchored_end(); |
529 | info.set_line_anchored_end(x); |
530 | |
531 | let x = info.is_any_anchored_start() |
532 | || e.is_any_anchored_start(); |
533 | info.set_any_anchored_start(x); |
534 | |
535 | let x = |
536 | info.is_any_anchored_end() || e.is_any_anchored_end(); |
537 | info.set_any_anchored_end(x); |
538 | |
539 | let x = info.is_match_empty() || e.is_match_empty(); |
540 | info.set_match_empty(x); |
541 | |
542 | let x = info.is_alternation_literal() && e.is_literal(); |
543 | info.set_alternation_literal(x); |
544 | } |
545 | Hir { kind: HirKind::Alternation(exprs), info } |
546 | } |
547 | } |
548 | } |
549 | |
550 | /// Build an HIR expression for `.`. |
551 | /// |
552 | /// A `.` expression matches any character except for `\n`. To build an |
553 | /// expression that matches any character, including `\n`, use the `any` |
554 | /// method. |
555 | /// |
556 | /// If `bytes` is `true`, then this assumes characters are limited to a |
557 | /// single byte. |
558 | pub fn dot(bytes: bool) -> Hir { |
559 | if bytes { |
560 | let mut cls = ClassBytes::empty(); |
561 | cls.push(ClassBytesRange::new(b' \0' , b' \x09' )); |
562 | cls.push(ClassBytesRange::new(b' \x0B' , b' \xFF' )); |
563 | Hir::class(Class::Bytes(cls)) |
564 | } else { |
565 | let mut cls = ClassUnicode::empty(); |
566 | cls.push(ClassUnicodeRange::new(' \0' , ' \x09' )); |
567 | cls.push(ClassUnicodeRange::new(' \x0B' , ' \u{10FFFF}' )); |
568 | Hir::class(Class::Unicode(cls)) |
569 | } |
570 | } |
571 | |
572 | /// Build an HIR expression for `(?s).`. |
573 | /// |
574 | /// A `(?s).` expression matches any character, including `\n`. To build an |
575 | /// expression that matches any character except for `\n`, then use the |
576 | /// `dot` method. |
577 | /// |
578 | /// If `bytes` is `true`, then this assumes characters are limited to a |
579 | /// single byte. |
580 | pub fn any(bytes: bool) -> Hir { |
581 | if bytes { |
582 | let mut cls = ClassBytes::empty(); |
583 | cls.push(ClassBytesRange::new(b' \0' , b' \xFF' )); |
584 | Hir::class(Class::Bytes(cls)) |
585 | } else { |
586 | let mut cls = ClassUnicode::empty(); |
587 | cls.push(ClassUnicodeRange::new(' \0' , ' \u{10FFFF}' )); |
588 | Hir::class(Class::Unicode(cls)) |
589 | } |
590 | } |
591 | |
592 | /// Return true if and only if this HIR will always match valid UTF-8. |
593 | /// |
594 | /// When this returns false, then it is possible for this HIR expression |
595 | /// to match invalid UTF-8. |
596 | pub fn is_always_utf8(&self) -> bool { |
597 | self.info.is_always_utf8() |
598 | } |
599 | |
600 | /// Returns true if and only if this entire HIR expression is made up of |
601 | /// zero-width assertions. |
602 | /// |
603 | /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but |
604 | /// not `^a`. |
605 | pub fn is_all_assertions(&self) -> bool { |
606 | self.info.is_all_assertions() |
607 | } |
608 | |
609 | /// Return true if and only if this HIR is required to match from the |
610 | /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, |
611 | /// `^foo|^bar` but not `^foo|bar`. |
612 | pub fn is_anchored_start(&self) -> bool { |
613 | self.info.is_anchored_start() |
614 | } |
615 | |
616 | /// Return true if and only if this HIR is required to match at the end |
617 | /// of text. This includes expressions like `foo$`, `(foo|bar)$`, |
618 | /// `foo$|bar$` but not `foo$|bar`. |
619 | pub fn is_anchored_end(&self) -> bool { |
620 | self.info.is_anchored_end() |
621 | } |
622 | |
623 | /// Return true if and only if this HIR is required to match from the |
624 | /// beginning of text or the beginning of a line. This includes expressions |
625 | /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` |
626 | /// but not `^foo|bar` or `(?m)^foo|bar`. |
627 | /// |
628 | /// Note that if `is_anchored_start` is `true`, then |
629 | /// `is_line_anchored_start` will also be `true`. The reverse implication |
630 | /// is not true. For example, `(?m)^foo` is line anchored, but not |
631 | /// `is_anchored_start`. |
632 | pub fn is_line_anchored_start(&self) -> bool { |
633 | self.info.is_line_anchored_start() |
634 | } |
635 | |
636 | /// Return true if and only if this HIR is required to match at the |
637 | /// end of text or the end of a line. This includes expressions like |
638 | /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, |
639 | /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. |
640 | /// |
641 | /// Note that if `is_anchored_end` is `true`, then |
642 | /// `is_line_anchored_end` will also be `true`. The reverse implication |
643 | /// is not true. For example, `(?m)foo$` is line anchored, but not |
644 | /// `is_anchored_end`. |
645 | pub fn is_line_anchored_end(&self) -> bool { |
646 | self.info.is_line_anchored_end() |
647 | } |
648 | |
649 | /// Return true if and only if this HIR contains any sub-expression that |
650 | /// is required to match at the beginning of text. Specifically, this |
651 | /// returns true if the `^` symbol (when multiline mode is disabled) or the |
652 | /// `\A` escape appear anywhere in the regex. |
653 | pub fn is_any_anchored_start(&self) -> bool { |
654 | self.info.is_any_anchored_start() |
655 | } |
656 | |
657 | /// Return true if and only if this HIR contains any sub-expression that is |
658 | /// required to match at the end of text. Specifically, this returns true |
659 | /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape |
660 | /// appear anywhere in the regex. |
661 | pub fn is_any_anchored_end(&self) -> bool { |
662 | self.info.is_any_anchored_end() |
663 | } |
664 | |
665 | /// Return true if and only if the empty string is part of the language |
666 | /// matched by this regular expression. |
667 | /// |
668 | /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` |
669 | /// and `\B`, but not `a` or `a+`. |
670 | pub fn is_match_empty(&self) -> bool { |
671 | self.info.is_match_empty() |
672 | } |
673 | |
674 | /// Return true if and only if this HIR is a simple literal. This is only |
675 | /// true when this HIR expression is either itself a `Literal` or a |
676 | /// concatenation of only `Literal`s. |
677 | /// |
678 | /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, |
679 | /// `` are not (even though that contain sub-expressions that are literals). |
680 | pub fn is_literal(&self) -> bool { |
681 | self.info.is_literal() |
682 | } |
683 | |
684 | /// Return true if and only if this HIR is either a simple literal or an |
685 | /// alternation of simple literals. This is only |
686 | /// true when this HIR expression is either itself a `Literal` or a |
687 | /// concatenation of only `Literal`s or an alternation of only `Literal`s. |
688 | /// |
689 | /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation |
690 | /// literals, but `f+`, `(foo)`, `foo()`, `` |
691 | /// are not (even though that contain sub-expressions that are literals). |
692 | pub fn is_alternation_literal(&self) -> bool { |
693 | self.info.is_alternation_literal() |
694 | } |
695 | } |
696 | |
697 | impl HirKind { |
698 | /// Return true if and only if this HIR is the empty regular expression. |
699 | /// |
700 | /// Note that this is not defined inductively. That is, it only tests if |
701 | /// this kind is the `Empty` variant. To get the inductive definition, |
702 | /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). |
703 | pub fn is_empty(&self) -> bool { |
704 | match *self { |
705 | HirKind::Empty => true, |
706 | _ => false, |
707 | } |
708 | } |
709 | |
710 | /// Returns true if and only if this kind has any (including possibly |
711 | /// empty) subexpressions. |
712 | pub fn has_subexprs(&self) -> bool { |
713 | match *self { |
714 | HirKind::Empty |
715 | | HirKind::Literal(_) |
716 | | HirKind::Class(_) |
717 | | HirKind::Anchor(_) |
718 | | HirKind::WordBoundary(_) => false, |
719 | HirKind::Group(_) |
720 | | HirKind::Repetition(_) |
721 | | HirKind::Concat(_) |
722 | | HirKind::Alternation(_) => true, |
723 | } |
724 | } |
725 | } |
726 | |
727 | /// Print a display representation of this Hir. |
728 | /// |
729 | /// The result of this is a valid regular expression pattern string. |
730 | /// |
731 | /// This implementation uses constant stack space and heap space proportional |
732 | /// to the size of the `Hir`. |
733 | impl fmt::Display for Hir { |
734 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
735 | use crate::hir::print::Printer; |
736 | Printer::new().print(self, f) |
737 | } |
738 | } |
739 | |
740 | /// The high-level intermediate representation of a literal. |
741 | /// |
742 | /// A literal corresponds to a single character, where a character is either |
743 | /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters |
744 | /// are preferred whenever possible. In particular, a `Byte` variant is only |
745 | /// ever produced when it could match invalid UTF-8. |
746 | #[derive(Clone, Debug, Eq, PartialEq)] |
747 | pub enum Literal { |
748 | /// A single character represented by a Unicode scalar value. |
749 | Unicode(char), |
750 | /// A single character represented by an arbitrary byte. |
751 | Byte(u8), |
752 | } |
753 | |
754 | impl Literal { |
755 | /// Returns true if and only if this literal corresponds to a Unicode |
756 | /// scalar value. |
757 | pub fn is_unicode(&self) -> bool { |
758 | match *self { |
759 | Literal::Unicode(_) => true, |
760 | Literal::Byte(b) if b <= 0x7F => true, |
761 | Literal::Byte(_) => false, |
762 | } |
763 | } |
764 | } |
765 | |
766 | /// The high-level intermediate representation of a character class. |
767 | /// |
768 | /// A character class corresponds to a set of characters. A character is either |
769 | /// defined by a Unicode scalar value or a byte. Unicode characters are used |
770 | /// by default, while bytes are used when Unicode mode (via the `u` flag) is |
771 | /// disabled. |
772 | /// |
773 | /// A character class, regardless of its character type, is represented by a |
774 | /// sequence of non-overlapping non-adjacent ranges of characters. |
775 | /// |
776 | /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may |
777 | /// be produced even when it exclusively matches valid UTF-8. This is because |
778 | /// a `Bytes` variant represents an intention by the author of the regular |
779 | /// expression to disable Unicode mode, which in turn impacts the semantics of |
780 | /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not |
781 | /// match the same set of strings. |
782 | #[derive(Clone, Debug, Eq, PartialEq)] |
783 | pub enum Class { |
784 | /// A set of characters represented by Unicode scalar values. |
785 | Unicode(ClassUnicode), |
786 | /// A set of characters represented by arbitrary bytes (one byte per |
787 | /// character). |
788 | Bytes(ClassBytes), |
789 | } |
790 | |
791 | impl Class { |
792 | /// Apply Unicode simple case folding to this character class, in place. |
793 | /// The character class will be expanded to include all simple case folded |
794 | /// character variants. |
795 | /// |
796 | /// If this is a byte oriented character class, then this will be limited |
797 | /// to the ASCII ranges `A-Z` and `a-z`. |
798 | pub fn case_fold_simple(&mut self) { |
799 | match *self { |
800 | Class::Unicode(ref mut x) => x.case_fold_simple(), |
801 | Class::Bytes(ref mut x) => x.case_fold_simple(), |
802 | } |
803 | } |
804 | |
805 | /// Negate this character class in place. |
806 | /// |
807 | /// After completion, this character class will contain precisely the |
808 | /// characters that weren't previously in the class. |
809 | pub fn negate(&mut self) { |
810 | match *self { |
811 | Class::Unicode(ref mut x) => x.negate(), |
812 | Class::Bytes(ref mut x) => x.negate(), |
813 | } |
814 | } |
815 | |
816 | /// Returns true if and only if this character class will only ever match |
817 | /// valid UTF-8. |
818 | /// |
819 | /// A character class can match invalid UTF-8 only when the following |
820 | /// conditions are met: |
821 | /// |
822 | /// 1. The translator was configured to permit generating an expression |
823 | /// that can match invalid UTF-8. (By default, this is disabled.) |
824 | /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete |
825 | /// syntax or in the parser builder. By default, Unicode mode is |
826 | /// enabled. |
827 | pub fn is_always_utf8(&self) -> bool { |
828 | match *self { |
829 | Class::Unicode(_) => true, |
830 | Class::Bytes(ref x) => x.is_all_ascii(), |
831 | } |
832 | } |
833 | } |
834 | |
835 | /// A set of characters represented by Unicode scalar values. |
836 | #[derive(Clone, Debug, Eq, PartialEq)] |
837 | pub struct ClassUnicode { |
838 | set: IntervalSet<ClassUnicodeRange>, |
839 | } |
840 | |
841 | impl ClassUnicode { |
842 | /// Create a new class from a sequence of ranges. |
843 | /// |
844 | /// The given ranges do not need to be in any specific order, and ranges |
845 | /// may overlap. |
846 | pub fn new<I>(ranges: I) -> ClassUnicode |
847 | where |
848 | I: IntoIterator<Item = ClassUnicodeRange>, |
849 | { |
850 | ClassUnicode { set: IntervalSet::new(ranges) } |
851 | } |
852 | |
853 | /// Create a new class with no ranges. |
854 | pub fn empty() -> ClassUnicode { |
855 | ClassUnicode::new(vec![]) |
856 | } |
857 | |
858 | /// Add a new range to this set. |
859 | pub fn push(&mut self, range: ClassUnicodeRange) { |
860 | self.set.push(range); |
861 | } |
862 | |
863 | /// Return an iterator over all ranges in this class. |
864 | /// |
865 | /// The iterator yields ranges in ascending order. |
866 | pub fn iter(&self) -> ClassUnicodeIter<'_> { |
867 | ClassUnicodeIter(self.set.iter()) |
868 | } |
869 | |
870 | /// Return the underlying ranges as a slice. |
871 | pub fn ranges(&self) -> &[ClassUnicodeRange] { |
872 | self.set.intervals() |
873 | } |
874 | |
875 | /// Expand this character class such that it contains all case folded |
876 | /// characters, according to Unicode's "simple" mapping. For example, if |
877 | /// this class consists of the range `a-z`, then applying case folding will |
878 | /// result in the class containing both the ranges `a-z` and `A-Z`. |
879 | /// |
880 | /// # Panics |
881 | /// |
882 | /// This routine panics when the case mapping data necessary for this |
883 | /// routine to complete is unavailable. This occurs when the `unicode-case` |
884 | /// feature is not enabled. |
885 | /// |
886 | /// Callers should prefer using `try_case_fold_simple` instead, which will |
887 | /// return an error instead of panicking. |
888 | pub fn case_fold_simple(&mut self) { |
889 | self.set |
890 | .case_fold_simple() |
891 | .expect("unicode-case feature must be enabled" ); |
892 | } |
893 | |
894 | /// Expand this character class such that it contains all case folded |
895 | /// characters, according to Unicode's "simple" mapping. For example, if |
896 | /// this class consists of the range `a-z`, then applying case folding will |
897 | /// result in the class containing both the ranges `a-z` and `A-Z`. |
898 | /// |
899 | /// # Error |
900 | /// |
901 | /// This routine returns an error when the case mapping data necessary |
902 | /// for this routine to complete is unavailable. This occurs when the |
903 | /// `unicode-case` feature is not enabled. |
904 | pub fn try_case_fold_simple( |
905 | &mut self, |
906 | ) -> result::Result<(), CaseFoldError> { |
907 | self.set.case_fold_simple() |
908 | } |
909 | |
910 | /// Negate this character class. |
911 | /// |
912 | /// For all `c` where `c` is a Unicode scalar value, if `c` was in this |
913 | /// set, then it will not be in this set after negation. |
914 | pub fn negate(&mut self) { |
915 | self.set.negate(); |
916 | } |
917 | |
918 | /// Union this character class with the given character class, in place. |
919 | pub fn union(&mut self, other: &ClassUnicode) { |
920 | self.set.union(&other.set); |
921 | } |
922 | |
923 | /// Intersect this character class with the given character class, in |
924 | /// place. |
925 | pub fn intersect(&mut self, other: &ClassUnicode) { |
926 | self.set.intersect(&other.set); |
927 | } |
928 | |
929 | /// Subtract the given character class from this character class, in place. |
930 | pub fn difference(&mut self, other: &ClassUnicode) { |
931 | self.set.difference(&other.set); |
932 | } |
933 | |
934 | /// Compute the symmetric difference of the given character classes, in |
935 | /// place. |
936 | /// |
937 | /// This computes the symmetric difference of two character classes. This |
938 | /// removes all elements in this class that are also in the given class, |
939 | /// but all adds all elements from the given class that aren't in this |
940 | /// class. That is, the class will contain all elements in either class, |
941 | /// but will not contain any elements that are in both classes. |
942 | pub fn symmetric_difference(&mut self, other: &ClassUnicode) { |
943 | self.set.symmetric_difference(&other.set); |
944 | } |
945 | |
946 | /// Returns true if and only if this character class will either match |
947 | /// nothing or only ASCII bytes. Stated differently, this returns false |
948 | /// if and only if this class contains a non-ASCII codepoint. |
949 | pub fn is_all_ascii(&self) -> bool { |
950 | self.set.intervals().last().map_or(true, |r| r.end <= ' \x7F' ) |
951 | } |
952 | } |
953 | |
954 | /// An iterator over all ranges in a Unicode character class. |
955 | /// |
956 | /// The lifetime `'a` refers to the lifetime of the underlying class. |
957 | #[derive(Debug)] |
958 | pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); |
959 | |
960 | impl<'a> Iterator for ClassUnicodeIter<'a> { |
961 | type Item = &'a ClassUnicodeRange; |
962 | |
963 | fn next(&mut self) -> Option<&'a ClassUnicodeRange> { |
964 | self.0.next() |
965 | } |
966 | } |
967 | |
968 | /// A single range of characters represented by Unicode scalar values. |
969 | /// |
970 | /// The range is closed. That is, the start and end of the range are included |
971 | /// in the range. |
972 | #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] |
973 | pub struct ClassUnicodeRange { |
974 | start: char, |
975 | end: char, |
976 | } |
977 | |
978 | impl fmt::Debug for ClassUnicodeRange { |
979 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
980 | let start = if !self.start.is_whitespace() && !self.start.is_control() |
981 | { |
982 | self.start.to_string() |
983 | } else { |
984 | format!("0x{:X}" , self.start as u32) |
985 | }; |
986 | let end = if !self.end.is_whitespace() && !self.end.is_control() { |
987 | self.end.to_string() |
988 | } else { |
989 | format!("0x{:X}" , self.end as u32) |
990 | }; |
991 | f.debug_struct("ClassUnicodeRange" ) |
992 | .field("start" , &start) |
993 | .field("end" , &end) |
994 | .finish() |
995 | } |
996 | } |
997 | |
998 | impl Interval for ClassUnicodeRange { |
999 | type Bound = char; |
1000 | |
1001 | #[inline ] |
1002 | fn lower(&self) -> char { |
1003 | self.start |
1004 | } |
1005 | #[inline ] |
1006 | fn upper(&self) -> char { |
1007 | self.end |
1008 | } |
1009 | #[inline ] |
1010 | fn set_lower(&mut self, bound: char) { |
1011 | self.start = bound; |
1012 | } |
1013 | #[inline ] |
1014 | fn set_upper(&mut self, bound: char) { |
1015 | self.end = bound; |
1016 | } |
1017 | |
1018 | /// Apply simple case folding to this Unicode scalar value range. |
1019 | /// |
1020 | /// Additional ranges are appended to the given vector. Canonical ordering |
1021 | /// is *not* maintained in the given vector. |
1022 | fn case_fold_simple( |
1023 | &self, |
1024 | ranges: &mut Vec<ClassUnicodeRange>, |
1025 | ) -> Result<(), unicode::CaseFoldError> { |
1026 | if !unicode::contains_simple_case_mapping(self.start, self.end)? { |
1027 | return Ok(()); |
1028 | } |
1029 | let start = self.start as u32; |
1030 | let end = (self.end as u32).saturating_add(1); |
1031 | let mut next_simple_cp = None; |
1032 | for cp in (start..end).filter_map(char::from_u32) { |
1033 | if next_simple_cp.map_or(false, |next| cp < next) { |
1034 | continue; |
1035 | } |
1036 | let it = match unicode::simple_fold(cp)? { |
1037 | Ok(it) => it, |
1038 | Err(next) => { |
1039 | next_simple_cp = next; |
1040 | continue; |
1041 | } |
1042 | }; |
1043 | for cp_folded in it { |
1044 | ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); |
1045 | } |
1046 | } |
1047 | Ok(()) |
1048 | } |
1049 | } |
1050 | |
1051 | impl ClassUnicodeRange { |
1052 | /// Create a new Unicode scalar value range for a character class. |
1053 | /// |
1054 | /// The returned range is always in a canonical form. That is, the range |
1055 | /// returned always satisfies the invariant that `start <= end`. |
1056 | pub fn new(start: char, end: char) -> ClassUnicodeRange { |
1057 | ClassUnicodeRange::create(start, end) |
1058 | } |
1059 | |
1060 | /// Return the start of this range. |
1061 | /// |
1062 | /// The start of a range is always less than or equal to the end of the |
1063 | /// range. |
1064 | pub fn start(&self) -> char { |
1065 | self.start |
1066 | } |
1067 | |
1068 | /// Return the end of this range. |
1069 | /// |
1070 | /// The end of a range is always greater than or equal to the start of the |
1071 | /// range. |
1072 | pub fn end(&self) -> char { |
1073 | self.end |
1074 | } |
1075 | } |
1076 | |
1077 | /// A set of characters represented by arbitrary bytes (where one byte |
1078 | /// corresponds to one character). |
1079 | #[derive(Clone, Debug, Eq, PartialEq)] |
1080 | pub struct ClassBytes { |
1081 | set: IntervalSet<ClassBytesRange>, |
1082 | } |
1083 | |
1084 | impl ClassBytes { |
1085 | /// Create a new class from a sequence of ranges. |
1086 | /// |
1087 | /// The given ranges do not need to be in any specific order, and ranges |
1088 | /// may overlap. |
1089 | pub fn new<I>(ranges: I) -> ClassBytes |
1090 | where |
1091 | I: IntoIterator<Item = ClassBytesRange>, |
1092 | { |
1093 | ClassBytes { set: IntervalSet::new(ranges) } |
1094 | } |
1095 | |
1096 | /// Create a new class with no ranges. |
1097 | pub fn empty() -> ClassBytes { |
1098 | ClassBytes::new(vec![]) |
1099 | } |
1100 | |
1101 | /// Add a new range to this set. |
1102 | pub fn push(&mut self, range: ClassBytesRange) { |
1103 | self.set.push(range); |
1104 | } |
1105 | |
1106 | /// Return an iterator over all ranges in this class. |
1107 | /// |
1108 | /// The iterator yields ranges in ascending order. |
1109 | pub fn iter(&self) -> ClassBytesIter<'_> { |
1110 | ClassBytesIter(self.set.iter()) |
1111 | } |
1112 | |
1113 | /// Return the underlying ranges as a slice. |
1114 | pub fn ranges(&self) -> &[ClassBytesRange] { |
1115 | self.set.intervals() |
1116 | } |
1117 | |
1118 | /// Expand this character class such that it contains all case folded |
1119 | /// characters. For example, if this class consists of the range `a-z`, |
1120 | /// then applying case folding will result in the class containing both the |
1121 | /// ranges `a-z` and `A-Z`. |
1122 | /// |
1123 | /// Note that this only applies ASCII case folding, which is limited to the |
1124 | /// characters `a-z` and `A-Z`. |
1125 | pub fn case_fold_simple(&mut self) { |
1126 | self.set.case_fold_simple().expect("ASCII case folding never fails" ); |
1127 | } |
1128 | |
1129 | /// Negate this byte class. |
1130 | /// |
1131 | /// For all `b` where `b` is a any byte, if `b` was in this set, then it |
1132 | /// will not be in this set after negation. |
1133 | pub fn negate(&mut self) { |
1134 | self.set.negate(); |
1135 | } |
1136 | |
1137 | /// Union this byte class with the given byte class, in place. |
1138 | pub fn union(&mut self, other: &ClassBytes) { |
1139 | self.set.union(&other.set); |
1140 | } |
1141 | |
1142 | /// Intersect this byte class with the given byte class, in place. |
1143 | pub fn intersect(&mut self, other: &ClassBytes) { |
1144 | self.set.intersect(&other.set); |
1145 | } |
1146 | |
1147 | /// Subtract the given byte class from this byte class, in place. |
1148 | pub fn difference(&mut self, other: &ClassBytes) { |
1149 | self.set.difference(&other.set); |
1150 | } |
1151 | |
1152 | /// Compute the symmetric difference of the given byte classes, in place. |
1153 | /// |
1154 | /// This computes the symmetric difference of two byte classes. This |
1155 | /// removes all elements in this class that are also in the given class, |
1156 | /// but all adds all elements from the given class that aren't in this |
1157 | /// class. That is, the class will contain all elements in either class, |
1158 | /// but will not contain any elements that are in both classes. |
1159 | pub fn symmetric_difference(&mut self, other: &ClassBytes) { |
1160 | self.set.symmetric_difference(&other.set); |
1161 | } |
1162 | |
1163 | /// Returns true if and only if this character class will either match |
1164 | /// nothing or only ASCII bytes. Stated differently, this returns false |
1165 | /// if and only if this class contains a non-ASCII byte. |
1166 | pub fn is_all_ascii(&self) -> bool { |
1167 | self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) |
1168 | } |
1169 | } |
1170 | |
1171 | /// An iterator over all ranges in a byte character class. |
1172 | /// |
1173 | /// The lifetime `'a` refers to the lifetime of the underlying class. |
1174 | #[derive(Debug)] |
1175 | pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); |
1176 | |
1177 | impl<'a> Iterator for ClassBytesIter<'a> { |
1178 | type Item = &'a ClassBytesRange; |
1179 | |
1180 | fn next(&mut self) -> Option<&'a ClassBytesRange> { |
1181 | self.0.next() |
1182 | } |
1183 | } |
1184 | |
1185 | /// A single range of characters represented by arbitrary bytes. |
1186 | /// |
1187 | /// The range is closed. That is, the start and end of the range are included |
1188 | /// in the range. |
1189 | #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] |
1190 | pub struct ClassBytesRange { |
1191 | start: u8, |
1192 | end: u8, |
1193 | } |
1194 | |
1195 | impl Interval for ClassBytesRange { |
1196 | type Bound = u8; |
1197 | |
1198 | #[inline ] |
1199 | fn lower(&self) -> u8 { |
1200 | self.start |
1201 | } |
1202 | #[inline ] |
1203 | fn upper(&self) -> u8 { |
1204 | self.end |
1205 | } |
1206 | #[inline ] |
1207 | fn set_lower(&mut self, bound: u8) { |
1208 | self.start = bound; |
1209 | } |
1210 | #[inline ] |
1211 | fn set_upper(&mut self, bound: u8) { |
1212 | self.end = bound; |
1213 | } |
1214 | |
1215 | /// Apply simple case folding to this byte range. Only ASCII case mappings |
1216 | /// (for a-z) are applied. |
1217 | /// |
1218 | /// Additional ranges are appended to the given vector. Canonical ordering |
1219 | /// is *not* maintained in the given vector. |
1220 | fn case_fold_simple( |
1221 | &self, |
1222 | ranges: &mut Vec<ClassBytesRange>, |
1223 | ) -> Result<(), unicode::CaseFoldError> { |
1224 | if !ClassBytesRange::new(b'a' , b'z' ).is_intersection_empty(self) { |
1225 | let lower = cmp::max(self.start, b'a' ); |
1226 | let upper = cmp::min(self.end, b'z' ); |
1227 | ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); |
1228 | } |
1229 | if !ClassBytesRange::new(b'A' , b'Z' ).is_intersection_empty(self) { |
1230 | let lower = cmp::max(self.start, b'A' ); |
1231 | let upper = cmp::min(self.end, b'Z' ); |
1232 | ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); |
1233 | } |
1234 | Ok(()) |
1235 | } |
1236 | } |
1237 | |
1238 | impl ClassBytesRange { |
1239 | /// Create a new byte range for a character class. |
1240 | /// |
1241 | /// The returned range is always in a canonical form. That is, the range |
1242 | /// returned always satisfies the invariant that `start <= end`. |
1243 | pub fn new(start: u8, end: u8) -> ClassBytesRange { |
1244 | ClassBytesRange::create(start, end) |
1245 | } |
1246 | |
1247 | /// Return the start of this range. |
1248 | /// |
1249 | /// The start of a range is always less than or equal to the end of the |
1250 | /// range. |
1251 | pub fn start(&self) -> u8 { |
1252 | self.start |
1253 | } |
1254 | |
1255 | /// Return the end of this range. |
1256 | /// |
1257 | /// The end of a range is always greater than or equal to the start of the |
1258 | /// range. |
1259 | pub fn end(&self) -> u8 { |
1260 | self.end |
1261 | } |
1262 | } |
1263 | |
1264 | impl fmt::Debug for ClassBytesRange { |
1265 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1266 | let mut debug = f.debug_struct("ClassBytesRange" ); |
1267 | if self.start <= 0x7F { |
1268 | debug.field("start" , &(self.start as char)); |
1269 | } else { |
1270 | debug.field("start" , &self.start); |
1271 | } |
1272 | if self.end <= 0x7F { |
1273 | debug.field("end" , &(self.end as char)); |
1274 | } else { |
1275 | debug.field("end" , &self.end); |
1276 | } |
1277 | debug.finish() |
1278 | } |
1279 | } |
1280 | |
1281 | /// The high-level intermediate representation for an anchor assertion. |
1282 | /// |
1283 | /// A matching anchor assertion is always zero-length. |
1284 | #[derive(Clone, Debug, Eq, PartialEq)] |
1285 | pub enum Anchor { |
1286 | /// Match the beginning of a line or the beginning of text. Specifically, |
1287 | /// this matches at the starting position of the input, or at the position |
1288 | /// immediately following a `\n` character. |
1289 | StartLine, |
1290 | /// Match the end of a line or the end of text. Specifically, |
1291 | /// this matches at the end position of the input, or at the position |
1292 | /// immediately preceding a `\n` character. |
1293 | EndLine, |
1294 | /// Match the beginning of text. Specifically, this matches at the starting |
1295 | /// position of the input. |
1296 | StartText, |
1297 | /// Match the end of text. Specifically, this matches at the ending |
1298 | /// position of the input. |
1299 | EndText, |
1300 | } |
1301 | |
1302 | /// The high-level intermediate representation for a word-boundary assertion. |
1303 | /// |
1304 | /// A matching word boundary assertion is always zero-length. |
1305 | #[derive(Clone, Debug, Eq, PartialEq)] |
1306 | pub enum WordBoundary { |
1307 | /// Match a Unicode-aware word boundary. That is, this matches a position |
1308 | /// where the left adjacent character and right adjacent character |
1309 | /// correspond to a word and non-word or a non-word and word character. |
1310 | Unicode, |
1311 | /// Match a Unicode-aware negation of a word boundary. |
1312 | UnicodeNegate, |
1313 | /// Match an ASCII-only word boundary. That is, this matches a position |
1314 | /// where the left adjacent character and right adjacent character |
1315 | /// correspond to a word and non-word or a non-word and word character. |
1316 | Ascii, |
1317 | /// Match an ASCII-only negation of a word boundary. |
1318 | AsciiNegate, |
1319 | } |
1320 | |
1321 | impl WordBoundary { |
1322 | /// Returns true if and only if this word boundary assertion is negated. |
1323 | pub fn is_negated(&self) -> bool { |
1324 | match *self { |
1325 | WordBoundary::Unicode | WordBoundary::Ascii => false, |
1326 | WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, |
1327 | } |
1328 | } |
1329 | } |
1330 | |
1331 | /// The high-level intermediate representation for a group. |
1332 | /// |
1333 | /// This represents one of three possible group types: |
1334 | /// |
1335 | /// 1. A non-capturing group (e.g., `(?:expr)`). |
1336 | /// 2. A capturing group (e.g., `(expr)`). |
1337 | /// 3. A named capturing group (e.g., `(?P<name>expr)`). |
1338 | #[derive(Clone, Debug, Eq, PartialEq)] |
1339 | pub struct Group { |
1340 | /// The kind of this group. If it is a capturing group, then the kind |
1341 | /// contains the capture group index (and the name, if it is a named |
1342 | /// group). |
1343 | pub kind: GroupKind, |
1344 | /// The expression inside the capturing group, which may be empty. |
1345 | pub hir: Box<Hir>, |
1346 | } |
1347 | |
1348 | /// The kind of group. |
1349 | #[derive(Clone, Debug, Eq, PartialEq)] |
1350 | pub enum GroupKind { |
1351 | /// A normal unnamed capturing group. |
1352 | /// |
1353 | /// The value is the capture index of the group. |
1354 | CaptureIndex(u32), |
1355 | /// A named capturing group. |
1356 | CaptureName { |
1357 | /// The name of the group. |
1358 | name: String, |
1359 | /// The capture index of the group. |
1360 | index: u32, |
1361 | }, |
1362 | /// A non-capturing group. |
1363 | NonCapturing, |
1364 | } |
1365 | |
1366 | /// The high-level intermediate representation of a repetition operator. |
1367 | /// |
1368 | /// A repetition operator permits the repetition of an arbitrary |
1369 | /// sub-expression. |
1370 | #[derive(Clone, Debug, Eq, PartialEq)] |
1371 | pub struct Repetition { |
1372 | /// The kind of this repetition operator. |
1373 | pub kind: RepetitionKind, |
1374 | /// Whether this repetition operator is greedy or not. A greedy operator |
1375 | /// will match as much as it can. A non-greedy operator will match as |
1376 | /// little as it can. |
1377 | /// |
1378 | /// Typically, operators are greedy by default and are only non-greedy when |
1379 | /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is |
1380 | /// not. However, this can be inverted via the `U` "ungreedy" flag. |
1381 | pub greedy: bool, |
1382 | /// The expression being repeated. |
1383 | pub hir: Box<Hir>, |
1384 | } |
1385 | |
1386 | impl Repetition { |
1387 | /// Returns true if and only if this repetition operator makes it possible |
1388 | /// to match the empty string. |
1389 | /// |
1390 | /// Note that this is not defined inductively. For example, while `a*` |
1391 | /// will report `true`, `()+` will not, even though `()` matches the empty |
1392 | /// string and one or more occurrences of something that matches the empty |
1393 | /// string will always match the empty string. In order to get the |
1394 | /// inductive definition, see the corresponding method on |
1395 | /// [`Hir`](struct.Hir.html). |
1396 | pub fn is_match_empty(&self) -> bool { |
1397 | match self.kind { |
1398 | RepetitionKind::ZeroOrOne => true, |
1399 | RepetitionKind::ZeroOrMore => true, |
1400 | RepetitionKind::OneOrMore => false, |
1401 | RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, |
1402 | RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, |
1403 | RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, |
1404 | } |
1405 | } |
1406 | } |
1407 | |
1408 | /// The kind of a repetition operator. |
1409 | #[derive(Clone, Debug, Eq, PartialEq)] |
1410 | pub enum RepetitionKind { |
1411 | /// Matches a sub-expression zero or one times. |
1412 | ZeroOrOne, |
1413 | /// Matches a sub-expression zero or more times. |
1414 | ZeroOrMore, |
1415 | /// Matches a sub-expression one or more times. |
1416 | OneOrMore, |
1417 | /// Matches a sub-expression within a bounded range of times. |
1418 | Range(RepetitionRange), |
1419 | } |
1420 | |
1421 | /// The kind of a counted repetition operator. |
1422 | #[derive(Clone, Debug, Eq, PartialEq)] |
1423 | pub enum RepetitionRange { |
1424 | /// Matches a sub-expression exactly this many times. |
1425 | Exactly(u32), |
1426 | /// Matches a sub-expression at least this many times. |
1427 | AtLeast(u32), |
1428 | /// Matches a sub-expression at least `m` times and at most `n` times. |
1429 | Bounded(u32, u32), |
1430 | } |
1431 | |
1432 | /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack |
1433 | /// space but heap space proportional to the depth of the total `Hir`. |
1434 | impl Drop for Hir { |
1435 | fn drop(&mut self) { |
1436 | use std::mem; |
1437 | |
1438 | match *self.kind() { |
1439 | HirKind::Empty |
1440 | | HirKind::Literal(_) |
1441 | | HirKind::Class(_) |
1442 | | HirKind::Anchor(_) |
1443 | | HirKind::WordBoundary(_) => return, |
1444 | HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, |
1445 | HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, |
1446 | HirKind::Concat(ref x) if x.is_empty() => return, |
1447 | HirKind::Alternation(ref x) if x.is_empty() => return, |
1448 | _ => {} |
1449 | } |
1450 | |
1451 | let mut stack = vec![mem::replace(self, Hir::empty())]; |
1452 | while let Some(mut expr) = stack.pop() { |
1453 | match expr.kind { |
1454 | HirKind::Empty |
1455 | | HirKind::Literal(_) |
1456 | | HirKind::Class(_) |
1457 | | HirKind::Anchor(_) |
1458 | | HirKind::WordBoundary(_) => {} |
1459 | HirKind::Group(ref mut x) => { |
1460 | stack.push(mem::replace(&mut x.hir, Hir::empty())); |
1461 | } |
1462 | HirKind::Repetition(ref mut x) => { |
1463 | stack.push(mem::replace(&mut x.hir, Hir::empty())); |
1464 | } |
1465 | HirKind::Concat(ref mut x) => { |
1466 | stack.extend(x.drain(..)); |
1467 | } |
1468 | HirKind::Alternation(ref mut x) => { |
1469 | stack.extend(x.drain(..)); |
1470 | } |
1471 | } |
1472 | } |
1473 | } |
1474 | } |
1475 | |
1476 | /// A type that documents various attributes of an HIR expression. |
1477 | /// |
1478 | /// These attributes are typically defined inductively on the HIR. |
1479 | #[derive(Clone, Debug, Eq, PartialEq)] |
1480 | struct HirInfo { |
1481 | /// Represent yes/no questions by a bitfield to conserve space, since |
1482 | /// this is included in every HIR expression. |
1483 | /// |
1484 | /// If more attributes need to be added, it is OK to increase the size of |
1485 | /// this as appropriate. |
1486 | bools: u16, |
1487 | } |
1488 | |
1489 | // A simple macro for defining bitfield accessors/mutators. |
1490 | macro_rules! define_bool { |
1491 | ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { |
1492 | fn $is_fn_name(&self) -> bool { |
1493 | self.bools & (0b1 << $bit) > 0 |
1494 | } |
1495 | |
1496 | fn $set_fn_name(&mut self, yes: bool) { |
1497 | if yes { |
1498 | self.bools |= 1 << $bit; |
1499 | } else { |
1500 | self.bools &= !(1 << $bit); |
1501 | } |
1502 | } |
1503 | }; |
1504 | } |
1505 | |
1506 | impl HirInfo { |
1507 | fn new() -> HirInfo { |
1508 | HirInfo { bools: 0 } |
1509 | } |
1510 | |
1511 | define_bool!(0, is_always_utf8, set_always_utf8); |
1512 | define_bool!(1, is_all_assertions, set_all_assertions); |
1513 | define_bool!(2, is_anchored_start, set_anchored_start); |
1514 | define_bool!(3, is_anchored_end, set_anchored_end); |
1515 | define_bool!(4, is_line_anchored_start, set_line_anchored_start); |
1516 | define_bool!(5, is_line_anchored_end, set_line_anchored_end); |
1517 | define_bool!(6, is_any_anchored_start, set_any_anchored_start); |
1518 | define_bool!(7, is_any_anchored_end, set_any_anchored_end); |
1519 | define_bool!(8, is_match_empty, set_match_empty); |
1520 | define_bool!(9, is_literal, set_literal); |
1521 | define_bool!(10, is_alternation_literal, set_alternation_literal); |
1522 | } |
1523 | |
1524 | #[cfg (test)] |
1525 | mod tests { |
1526 | use super::*; |
1527 | |
1528 | fn uclass(ranges: &[(char, char)]) -> ClassUnicode { |
1529 | let ranges: Vec<ClassUnicodeRange> = ranges |
1530 | .iter() |
1531 | .map(|&(s, e)| ClassUnicodeRange::new(s, e)) |
1532 | .collect(); |
1533 | ClassUnicode::new(ranges) |
1534 | } |
1535 | |
1536 | fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { |
1537 | let ranges: Vec<ClassBytesRange> = |
1538 | ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); |
1539 | ClassBytes::new(ranges) |
1540 | } |
1541 | |
1542 | fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { |
1543 | cls.iter().map(|x| (x.start(), x.end())).collect() |
1544 | } |
1545 | |
1546 | #[cfg (feature = "unicode-case" )] |
1547 | fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { |
1548 | let mut cls_ = cls.clone(); |
1549 | cls_.case_fold_simple(); |
1550 | cls_ |
1551 | } |
1552 | |
1553 | fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { |
1554 | let mut cls_ = cls1.clone(); |
1555 | cls_.union(cls2); |
1556 | cls_ |
1557 | } |
1558 | |
1559 | fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { |
1560 | let mut cls_ = cls1.clone(); |
1561 | cls_.intersect(cls2); |
1562 | cls_ |
1563 | } |
1564 | |
1565 | fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { |
1566 | let mut cls_ = cls1.clone(); |
1567 | cls_.difference(cls2); |
1568 | cls_ |
1569 | } |
1570 | |
1571 | fn usymdifference( |
1572 | cls1: &ClassUnicode, |
1573 | cls2: &ClassUnicode, |
1574 | ) -> ClassUnicode { |
1575 | let mut cls_ = cls1.clone(); |
1576 | cls_.symmetric_difference(cls2); |
1577 | cls_ |
1578 | } |
1579 | |
1580 | fn unegate(cls: &ClassUnicode) -> ClassUnicode { |
1581 | let mut cls_ = cls.clone(); |
1582 | cls_.negate(); |
1583 | cls_ |
1584 | } |
1585 | |
1586 | fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { |
1587 | cls.iter().map(|x| (x.start(), x.end())).collect() |
1588 | } |
1589 | |
1590 | fn bcasefold(cls: &ClassBytes) -> ClassBytes { |
1591 | let mut cls_ = cls.clone(); |
1592 | cls_.case_fold_simple(); |
1593 | cls_ |
1594 | } |
1595 | |
1596 | fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
1597 | let mut cls_ = cls1.clone(); |
1598 | cls_.union(cls2); |
1599 | cls_ |
1600 | } |
1601 | |
1602 | fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
1603 | let mut cls_ = cls1.clone(); |
1604 | cls_.intersect(cls2); |
1605 | cls_ |
1606 | } |
1607 | |
1608 | fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
1609 | let mut cls_ = cls1.clone(); |
1610 | cls_.difference(cls2); |
1611 | cls_ |
1612 | } |
1613 | |
1614 | fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
1615 | let mut cls_ = cls1.clone(); |
1616 | cls_.symmetric_difference(cls2); |
1617 | cls_ |
1618 | } |
1619 | |
1620 | fn bnegate(cls: &ClassBytes) -> ClassBytes { |
1621 | let mut cls_ = cls.clone(); |
1622 | cls_.negate(); |
1623 | cls_ |
1624 | } |
1625 | |
1626 | #[test] |
1627 | fn class_range_canonical_unicode() { |
1628 | let range = ClassUnicodeRange::new(' \u{00FF}' , ' \0' ); |
1629 | assert_eq!(' \0' , range.start()); |
1630 | assert_eq!(' \u{00FF}' , range.end()); |
1631 | } |
1632 | |
1633 | #[test] |
1634 | fn class_range_canonical_bytes() { |
1635 | let range = ClassBytesRange::new(b' \xFF' , b' \0' ); |
1636 | assert_eq!(b' \0' , range.start()); |
1637 | assert_eq!(b' \xFF' , range.end()); |
1638 | } |
1639 | |
1640 | #[test] |
1641 | fn class_canonicalize_unicode() { |
1642 | let cls = uclass(&[('a' , 'c' ), ('x' , 'z' )]); |
1643 | let expected = vec![('a' , 'c' ), ('x' , 'z' )]; |
1644 | assert_eq!(expected, uranges(&cls)); |
1645 | |
1646 | let cls = uclass(&[('x' , 'z' ), ('a' , 'c' )]); |
1647 | let expected = vec![('a' , 'c' ), ('x' , 'z' )]; |
1648 | assert_eq!(expected, uranges(&cls)); |
1649 | |
1650 | let cls = uclass(&[('x' , 'z' ), ('w' , 'y' )]); |
1651 | let expected = vec![('w' , 'z' )]; |
1652 | assert_eq!(expected, uranges(&cls)); |
1653 | |
1654 | let cls = uclass(&[ |
1655 | ('c' , 'f' ), |
1656 | ('a' , 'g' ), |
1657 | ('d' , 'j' ), |
1658 | ('a' , 'c' ), |
1659 | ('m' , 'p' ), |
1660 | ('l' , 's' ), |
1661 | ]); |
1662 | let expected = vec![('a' , 'j' ), ('l' , 's' )]; |
1663 | assert_eq!(expected, uranges(&cls)); |
1664 | |
1665 | let cls = uclass(&[('x' , 'z' ), ('u' , 'w' )]); |
1666 | let expected = vec![('u' , 'z' )]; |
1667 | assert_eq!(expected, uranges(&cls)); |
1668 | |
1669 | let cls = uclass(&[(' \x00' , ' \u{10FFFF}' ), (' \x00' , ' \u{10FFFF}' )]); |
1670 | let expected = vec![(' \x00' , ' \u{10FFFF}' )]; |
1671 | assert_eq!(expected, uranges(&cls)); |
1672 | |
1673 | let cls = uclass(&[('a' , 'a' ), ('b' , 'b' )]); |
1674 | let expected = vec![('a' , 'b' )]; |
1675 | assert_eq!(expected, uranges(&cls)); |
1676 | } |
1677 | |
1678 | #[test] |
1679 | fn class_canonicalize_bytes() { |
1680 | let cls = bclass(&[(b'a' , b'c' ), (b'x' , b'z' )]); |
1681 | let expected = vec![(b'a' , b'c' ), (b'x' , b'z' )]; |
1682 | assert_eq!(expected, branges(&cls)); |
1683 | |
1684 | let cls = bclass(&[(b'x' , b'z' ), (b'a' , b'c' )]); |
1685 | let expected = vec![(b'a' , b'c' ), (b'x' , b'z' )]; |
1686 | assert_eq!(expected, branges(&cls)); |
1687 | |
1688 | let cls = bclass(&[(b'x' , b'z' ), (b'w' , b'y' )]); |
1689 | let expected = vec![(b'w' , b'z' )]; |
1690 | assert_eq!(expected, branges(&cls)); |
1691 | |
1692 | let cls = bclass(&[ |
1693 | (b'c' , b'f' ), |
1694 | (b'a' , b'g' ), |
1695 | (b'd' , b'j' ), |
1696 | (b'a' , b'c' ), |
1697 | (b'm' , b'p' ), |
1698 | (b'l' , b's' ), |
1699 | ]); |
1700 | let expected = vec![(b'a' , b'j' ), (b'l' , b's' )]; |
1701 | assert_eq!(expected, branges(&cls)); |
1702 | |
1703 | let cls = bclass(&[(b'x' , b'z' ), (b'u' , b'w' )]); |
1704 | let expected = vec![(b'u' , b'z' )]; |
1705 | assert_eq!(expected, branges(&cls)); |
1706 | |
1707 | let cls = bclass(&[(b' \x00' , b' \xFF' ), (b' \x00' , b' \xFF' )]); |
1708 | let expected = vec![(b' \x00' , b' \xFF' )]; |
1709 | assert_eq!(expected, branges(&cls)); |
1710 | |
1711 | let cls = bclass(&[(b'a' , b'a' ), (b'b' , b'b' )]); |
1712 | let expected = vec![(b'a' , b'b' )]; |
1713 | assert_eq!(expected, branges(&cls)); |
1714 | } |
1715 | |
1716 | #[test] |
1717 | #[cfg (feature = "unicode-case" )] |
1718 | fn class_case_fold_unicode() { |
1719 | let cls = uclass(&[ |
1720 | ('C' , 'F' ), |
1721 | ('A' , 'G' ), |
1722 | ('D' , 'J' ), |
1723 | ('A' , 'C' ), |
1724 | ('M' , 'P' ), |
1725 | ('L' , 'S' ), |
1726 | ('c' , 'f' ), |
1727 | ]); |
1728 | let expected = uclass(&[ |
1729 | ('A' , 'J' ), |
1730 | ('L' , 'S' ), |
1731 | ('a' , 'j' ), |
1732 | ('l' , 's' ), |
1733 | (' \u{17F}' , ' \u{17F}' ), |
1734 | ]); |
1735 | assert_eq!(expected, ucasefold(&cls)); |
1736 | |
1737 | let cls = uclass(&[('A' , 'Z' )]); |
1738 | let expected = uclass(&[ |
1739 | ('A' , 'Z' ), |
1740 | ('a' , 'z' ), |
1741 | (' \u{17F}' , ' \u{17F}' ), |
1742 | (' \u{212A}' , ' \u{212A}' ), |
1743 | ]); |
1744 | assert_eq!(expected, ucasefold(&cls)); |
1745 | |
1746 | let cls = uclass(&[('a' , 'z' )]); |
1747 | let expected = uclass(&[ |
1748 | ('A' , 'Z' ), |
1749 | ('a' , 'z' ), |
1750 | (' \u{17F}' , ' \u{17F}' ), |
1751 | (' \u{212A}' , ' \u{212A}' ), |
1752 | ]); |
1753 | assert_eq!(expected, ucasefold(&cls)); |
1754 | |
1755 | let cls = uclass(&[('A' , 'A' ), ('_' , '_' )]); |
1756 | let expected = uclass(&[('A' , 'A' ), ('_' , '_' ), ('a' , 'a' )]); |
1757 | assert_eq!(expected, ucasefold(&cls)); |
1758 | |
1759 | let cls = uclass(&[('A' , 'A' ), ('=' , '=' )]); |
1760 | let expected = uclass(&[('=' , '=' ), ('A' , 'A' ), ('a' , 'a' )]); |
1761 | assert_eq!(expected, ucasefold(&cls)); |
1762 | |
1763 | let cls = uclass(&[(' \x00' , ' \x10' )]); |
1764 | assert_eq!(cls, ucasefold(&cls)); |
1765 | |
1766 | let cls = uclass(&[('k' , 'k' )]); |
1767 | let expected = |
1768 | uclass(&[('K' , 'K' ), ('k' , 'k' ), (' \u{212A}' , ' \u{212A}' )]); |
1769 | assert_eq!(expected, ucasefold(&cls)); |
1770 | |
1771 | let cls = uclass(&[('@' , '@' )]); |
1772 | assert_eq!(cls, ucasefold(&cls)); |
1773 | } |
1774 | |
1775 | #[test] |
1776 | #[cfg (not(feature = "unicode-case" ))] |
1777 | fn class_case_fold_unicode_disabled() { |
1778 | let mut cls = uclass(&[ |
1779 | ('C' , 'F' ), |
1780 | ('A' , 'G' ), |
1781 | ('D' , 'J' ), |
1782 | ('A' , 'C' ), |
1783 | ('M' , 'P' ), |
1784 | ('L' , 'S' ), |
1785 | ('c' , 'f' ), |
1786 | ]); |
1787 | assert!(cls.try_case_fold_simple().is_err()); |
1788 | } |
1789 | |
1790 | #[test] |
1791 | #[should_panic ] |
1792 | #[cfg (not(feature = "unicode-case" ))] |
1793 | fn class_case_fold_unicode_disabled_panics() { |
1794 | let mut cls = uclass(&[ |
1795 | ('C' , 'F' ), |
1796 | ('A' , 'G' ), |
1797 | ('D' , 'J' ), |
1798 | ('A' , 'C' ), |
1799 | ('M' , 'P' ), |
1800 | ('L' , 'S' ), |
1801 | ('c' , 'f' ), |
1802 | ]); |
1803 | cls.case_fold_simple(); |
1804 | } |
1805 | |
1806 | #[test] |
1807 | fn class_case_fold_bytes() { |
1808 | let cls = bclass(&[ |
1809 | (b'C' , b'F' ), |
1810 | (b'A' , b'G' ), |
1811 | (b'D' , b'J' ), |
1812 | (b'A' , b'C' ), |
1813 | (b'M' , b'P' ), |
1814 | (b'L' , b'S' ), |
1815 | (b'c' , b'f' ), |
1816 | ]); |
1817 | let expected = |
1818 | bclass(&[(b'A' , b'J' ), (b'L' , b'S' ), (b'a' , b'j' ), (b'l' , b's' )]); |
1819 | assert_eq!(expected, bcasefold(&cls)); |
1820 | |
1821 | let cls = bclass(&[(b'A' , b'Z' )]); |
1822 | let expected = bclass(&[(b'A' , b'Z' ), (b'a' , b'z' )]); |
1823 | assert_eq!(expected, bcasefold(&cls)); |
1824 | |
1825 | let cls = bclass(&[(b'a' , b'z' )]); |
1826 | let expected = bclass(&[(b'A' , b'Z' ), (b'a' , b'z' )]); |
1827 | assert_eq!(expected, bcasefold(&cls)); |
1828 | |
1829 | let cls = bclass(&[(b'A' , b'A' ), (b'_' , b'_' )]); |
1830 | let expected = bclass(&[(b'A' , b'A' ), (b'_' , b'_' ), (b'a' , b'a' )]); |
1831 | assert_eq!(expected, bcasefold(&cls)); |
1832 | |
1833 | let cls = bclass(&[(b'A' , b'A' ), (b'=' , b'=' )]); |
1834 | let expected = bclass(&[(b'=' , b'=' ), (b'A' , b'A' ), (b'a' , b'a' )]); |
1835 | assert_eq!(expected, bcasefold(&cls)); |
1836 | |
1837 | let cls = bclass(&[(b' \x00' , b' \x10' )]); |
1838 | assert_eq!(cls, bcasefold(&cls)); |
1839 | |
1840 | let cls = bclass(&[(b'k' , b'k' )]); |
1841 | let expected = bclass(&[(b'K' , b'K' ), (b'k' , b'k' )]); |
1842 | assert_eq!(expected, bcasefold(&cls)); |
1843 | |
1844 | let cls = bclass(&[(b'@' , b'@' )]); |
1845 | assert_eq!(cls, bcasefold(&cls)); |
1846 | } |
1847 | |
1848 | #[test] |
1849 | fn class_negate_unicode() { |
1850 | let cls = uclass(&[('a' , 'a' )]); |
1851 | let expected = uclass(&[(' \x00' , ' \x60' ), (' \x62' , ' \u{10FFFF}' )]); |
1852 | assert_eq!(expected, unegate(&cls)); |
1853 | |
1854 | let cls = uclass(&[('a' , 'a' ), ('b' , 'b' )]); |
1855 | let expected = uclass(&[(' \x00' , ' \x60' ), (' \x63' , ' \u{10FFFF}' )]); |
1856 | assert_eq!(expected, unegate(&cls)); |
1857 | |
1858 | let cls = uclass(&[('a' , 'c' ), ('x' , 'z' )]); |
1859 | let expected = uclass(&[ |
1860 | (' \x00' , ' \x60' ), |
1861 | (' \x64' , ' \x77' ), |
1862 | (' \x7B' , ' \u{10FFFF}' ), |
1863 | ]); |
1864 | assert_eq!(expected, unegate(&cls)); |
1865 | |
1866 | let cls = uclass(&[(' \x00' , 'a' )]); |
1867 | let expected = uclass(&[(' \x62' , ' \u{10FFFF}' )]); |
1868 | assert_eq!(expected, unegate(&cls)); |
1869 | |
1870 | let cls = uclass(&[('a' , ' \u{10FFFF}' )]); |
1871 | let expected = uclass(&[(' \x00' , ' \x60' )]); |
1872 | assert_eq!(expected, unegate(&cls)); |
1873 | |
1874 | let cls = uclass(&[(' \x00' , ' \u{10FFFF}' )]); |
1875 | let expected = uclass(&[]); |
1876 | assert_eq!(expected, unegate(&cls)); |
1877 | |
1878 | let cls = uclass(&[]); |
1879 | let expected = uclass(&[(' \x00' , ' \u{10FFFF}' )]); |
1880 | assert_eq!(expected, unegate(&cls)); |
1881 | |
1882 | let cls = |
1883 | uclass(&[(' \x00' , ' \u{10FFFD}' ), (' \u{10FFFF}' , ' \u{10FFFF}' )]); |
1884 | let expected = uclass(&[(' \u{10FFFE}' , ' \u{10FFFE}' )]); |
1885 | assert_eq!(expected, unegate(&cls)); |
1886 | |
1887 | let cls = uclass(&[(' \x00' , ' \u{D7FF}' )]); |
1888 | let expected = uclass(&[(' \u{E000}' , ' \u{10FFFF}' )]); |
1889 | assert_eq!(expected, unegate(&cls)); |
1890 | |
1891 | let cls = uclass(&[(' \x00' , ' \u{D7FE}' )]); |
1892 | let expected = uclass(&[(' \u{D7FF}' , ' \u{10FFFF}' )]); |
1893 | assert_eq!(expected, unegate(&cls)); |
1894 | |
1895 | let cls = uclass(&[(' \u{E000}' , ' \u{10FFFF}' )]); |
1896 | let expected = uclass(&[(' \x00' , ' \u{D7FF}' )]); |
1897 | assert_eq!(expected, unegate(&cls)); |
1898 | |
1899 | let cls = uclass(&[(' \u{E001}' , ' \u{10FFFF}' )]); |
1900 | let expected = uclass(&[(' \x00' , ' \u{E000}' )]); |
1901 | assert_eq!(expected, unegate(&cls)); |
1902 | } |
1903 | |
1904 | #[test] |
1905 | fn class_negate_bytes() { |
1906 | let cls = bclass(&[(b'a' , b'a' )]); |
1907 | let expected = bclass(&[(b' \x00' , b' \x60' ), (b' \x62' , b' \xFF' )]); |
1908 | assert_eq!(expected, bnegate(&cls)); |
1909 | |
1910 | let cls = bclass(&[(b'a' , b'a' ), (b'b' , b'b' )]); |
1911 | let expected = bclass(&[(b' \x00' , b' \x60' ), (b' \x63' , b' \xFF' )]); |
1912 | assert_eq!(expected, bnegate(&cls)); |
1913 | |
1914 | let cls = bclass(&[(b'a' , b'c' ), (b'x' , b'z' )]); |
1915 | let expected = bclass(&[ |
1916 | (b' \x00' , b' \x60' ), |
1917 | (b' \x64' , b' \x77' ), |
1918 | (b' \x7B' , b' \xFF' ), |
1919 | ]); |
1920 | assert_eq!(expected, bnegate(&cls)); |
1921 | |
1922 | let cls = bclass(&[(b' \x00' , b'a' )]); |
1923 | let expected = bclass(&[(b' \x62' , b' \xFF' )]); |
1924 | assert_eq!(expected, bnegate(&cls)); |
1925 | |
1926 | let cls = bclass(&[(b'a' , b' \xFF' )]); |
1927 | let expected = bclass(&[(b' \x00' , b' \x60' )]); |
1928 | assert_eq!(expected, bnegate(&cls)); |
1929 | |
1930 | let cls = bclass(&[(b' \x00' , b' \xFF' )]); |
1931 | let expected = bclass(&[]); |
1932 | assert_eq!(expected, bnegate(&cls)); |
1933 | |
1934 | let cls = bclass(&[]); |
1935 | let expected = bclass(&[(b' \x00' , b' \xFF' )]); |
1936 | assert_eq!(expected, bnegate(&cls)); |
1937 | |
1938 | let cls = bclass(&[(b' \x00' , b' \xFD' ), (b' \xFF' , b' \xFF' )]); |
1939 | let expected = bclass(&[(b' \xFE' , b' \xFE' )]); |
1940 | assert_eq!(expected, bnegate(&cls)); |
1941 | } |
1942 | |
1943 | #[test] |
1944 | fn class_union_unicode() { |
1945 | let cls1 = uclass(&[('a' , 'g' ), ('m' , 't' ), ('A' , 'C' )]); |
1946 | let cls2 = uclass(&[('a' , 'z' )]); |
1947 | let expected = uclass(&[('a' , 'z' ), ('A' , 'C' )]); |
1948 | assert_eq!(expected, uunion(&cls1, &cls2)); |
1949 | } |
1950 | |
1951 | #[test] |
1952 | fn class_union_bytes() { |
1953 | let cls1 = bclass(&[(b'a' , b'g' ), (b'm' , b't' ), (b'A' , b'C' )]); |
1954 | let cls2 = bclass(&[(b'a' , b'z' )]); |
1955 | let expected = bclass(&[(b'a' , b'z' ), (b'A' , b'C' )]); |
1956 | assert_eq!(expected, bunion(&cls1, &cls2)); |
1957 | } |
1958 | |
1959 | #[test] |
1960 | fn class_intersect_unicode() { |
1961 | let cls1 = uclass(&[]); |
1962 | let cls2 = uclass(&[('a' , 'a' )]); |
1963 | let expected = uclass(&[]); |
1964 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1965 | |
1966 | let cls1 = uclass(&[('a' , 'a' )]); |
1967 | let cls2 = uclass(&[('a' , 'a' )]); |
1968 | let expected = uclass(&[('a' , 'a' )]); |
1969 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1970 | |
1971 | let cls1 = uclass(&[('a' , 'a' )]); |
1972 | let cls2 = uclass(&[('b' , 'b' )]); |
1973 | let expected = uclass(&[]); |
1974 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1975 | |
1976 | let cls1 = uclass(&[('a' , 'a' )]); |
1977 | let cls2 = uclass(&[('a' , 'c' )]); |
1978 | let expected = uclass(&[('a' , 'a' )]); |
1979 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1980 | |
1981 | let cls1 = uclass(&[('a' , 'b' )]); |
1982 | let cls2 = uclass(&[('a' , 'c' )]); |
1983 | let expected = uclass(&[('a' , 'b' )]); |
1984 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1985 | |
1986 | let cls1 = uclass(&[('a' , 'b' )]); |
1987 | let cls2 = uclass(&[('b' , 'c' )]); |
1988 | let expected = uclass(&[('b' , 'b' )]); |
1989 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1990 | |
1991 | let cls1 = uclass(&[('a' , 'b' )]); |
1992 | let cls2 = uclass(&[('c' , 'd' )]); |
1993 | let expected = uclass(&[]); |
1994 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
1995 | |
1996 | let cls1 = uclass(&[('b' , 'c' )]); |
1997 | let cls2 = uclass(&[('a' , 'd' )]); |
1998 | let expected = uclass(&[('b' , 'c' )]); |
1999 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2000 | |
2001 | let cls1 = uclass(&[('a' , 'b' ), ('d' , 'e' ), ('g' , 'h' )]); |
2002 | let cls2 = uclass(&[('a' , 'h' )]); |
2003 | let expected = uclass(&[('a' , 'b' ), ('d' , 'e' ), ('g' , 'h' )]); |
2004 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2005 | |
2006 | let cls1 = uclass(&[('a' , 'b' ), ('d' , 'e' ), ('g' , 'h' )]); |
2007 | let cls2 = uclass(&[('a' , 'b' ), ('d' , 'e' ), ('g' , 'h' )]); |
2008 | let expected = uclass(&[('a' , 'b' ), ('d' , 'e' ), ('g' , 'h' )]); |
2009 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2010 | |
2011 | let cls1 = uclass(&[('a' , 'b' ), ('g' , 'h' )]); |
2012 | let cls2 = uclass(&[('d' , 'e' ), ('k' , 'l' )]); |
2013 | let expected = uclass(&[]); |
2014 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2015 | |
2016 | let cls1 = uclass(&[('a' , 'b' ), ('d' , 'e' ), ('g' , 'h' )]); |
2017 | let cls2 = uclass(&[('h' , 'h' )]); |
2018 | let expected = uclass(&[('h' , 'h' )]); |
2019 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2020 | |
2021 | let cls1 = uclass(&[('a' , 'b' ), ('e' , 'f' ), ('i' , 'j' )]); |
2022 | let cls2 = uclass(&[('c' , 'd' ), ('g' , 'h' ), ('k' , 'l' )]); |
2023 | let expected = uclass(&[]); |
2024 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2025 | |
2026 | let cls1 = uclass(&[('a' , 'b' ), ('c' , 'd' ), ('e' , 'f' )]); |
2027 | let cls2 = uclass(&[('b' , 'c' ), ('d' , 'e' ), ('f' , 'g' )]); |
2028 | let expected = uclass(&[('b' , 'f' )]); |
2029 | assert_eq!(expected, uintersect(&cls1, &cls2)); |
2030 | } |
2031 | |
2032 | #[test] |
2033 | fn class_intersect_bytes() { |
2034 | let cls1 = bclass(&[]); |
2035 | let cls2 = bclass(&[(b'a' , b'a' )]); |
2036 | let expected = bclass(&[]); |
2037 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2038 | |
2039 | let cls1 = bclass(&[(b'a' , b'a' )]); |
2040 | let cls2 = bclass(&[(b'a' , b'a' )]); |
2041 | let expected = bclass(&[(b'a' , b'a' )]); |
2042 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2043 | |
2044 | let cls1 = bclass(&[(b'a' , b'a' )]); |
2045 | let cls2 = bclass(&[(b'b' , b'b' )]); |
2046 | let expected = bclass(&[]); |
2047 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2048 | |
2049 | let cls1 = bclass(&[(b'a' , b'a' )]); |
2050 | let cls2 = bclass(&[(b'a' , b'c' )]); |
2051 | let expected = bclass(&[(b'a' , b'a' )]); |
2052 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2053 | |
2054 | let cls1 = bclass(&[(b'a' , b'b' )]); |
2055 | let cls2 = bclass(&[(b'a' , b'c' )]); |
2056 | let expected = bclass(&[(b'a' , b'b' )]); |
2057 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2058 | |
2059 | let cls1 = bclass(&[(b'a' , b'b' )]); |
2060 | let cls2 = bclass(&[(b'b' , b'c' )]); |
2061 | let expected = bclass(&[(b'b' , b'b' )]); |
2062 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2063 | |
2064 | let cls1 = bclass(&[(b'a' , b'b' )]); |
2065 | let cls2 = bclass(&[(b'c' , b'd' )]); |
2066 | let expected = bclass(&[]); |
2067 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2068 | |
2069 | let cls1 = bclass(&[(b'b' , b'c' )]); |
2070 | let cls2 = bclass(&[(b'a' , b'd' )]); |
2071 | let expected = bclass(&[(b'b' , b'c' )]); |
2072 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2073 | |
2074 | let cls1 = bclass(&[(b'a' , b'b' ), (b'd' , b'e' ), (b'g' , b'h' )]); |
2075 | let cls2 = bclass(&[(b'a' , b'h' )]); |
2076 | let expected = bclass(&[(b'a' , b'b' ), (b'd' , b'e' ), (b'g' , b'h' )]); |
2077 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2078 | |
2079 | let cls1 = bclass(&[(b'a' , b'b' ), (b'd' , b'e' ), (b'g' , b'h' )]); |
2080 | let cls2 = bclass(&[(b'a' , b'b' ), (b'd' , b'e' ), (b'g' , b'h' )]); |
2081 | let expected = bclass(&[(b'a' , b'b' ), (b'd' , b'e' ), (b'g' , b'h' )]); |
2082 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2083 | |
2084 | let cls1 = bclass(&[(b'a' , b'b' ), (b'g' , b'h' )]); |
2085 | let cls2 = bclass(&[(b'd' , b'e' ), (b'k' , b'l' )]); |
2086 | let expected = bclass(&[]); |
2087 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2088 | |
2089 | let cls1 = bclass(&[(b'a' , b'b' ), (b'd' , b'e' ), (b'g' , b'h' )]); |
2090 | let cls2 = bclass(&[(b'h' , b'h' )]); |
2091 | let expected = bclass(&[(b'h' , b'h' )]); |
2092 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2093 | |
2094 | let cls1 = bclass(&[(b'a' , b'b' ), (b'e' , b'f' ), (b'i' , b'j' )]); |
2095 | let cls2 = bclass(&[(b'c' , b'd' ), (b'g' , b'h' ), (b'k' , b'l' )]); |
2096 | let expected = bclass(&[]); |
2097 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2098 | |
2099 | let cls1 = bclass(&[(b'a' , b'b' ), (b'c' , b'd' ), (b'e' , b'f' )]); |
2100 | let cls2 = bclass(&[(b'b' , b'c' ), (b'd' , b'e' ), (b'f' , b'g' )]); |
2101 | let expected = bclass(&[(b'b' , b'f' )]); |
2102 | assert_eq!(expected, bintersect(&cls1, &cls2)); |
2103 | } |
2104 | |
2105 | #[test] |
2106 | fn class_difference_unicode() { |
2107 | let cls1 = uclass(&[('a' , 'a' )]); |
2108 | let cls2 = uclass(&[('a' , 'a' )]); |
2109 | let expected = uclass(&[]); |
2110 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2111 | |
2112 | let cls1 = uclass(&[('a' , 'a' )]); |
2113 | let cls2 = uclass(&[]); |
2114 | let expected = uclass(&[('a' , 'a' )]); |
2115 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2116 | |
2117 | let cls1 = uclass(&[]); |
2118 | let cls2 = uclass(&[('a' , 'a' )]); |
2119 | let expected = uclass(&[]); |
2120 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2121 | |
2122 | let cls1 = uclass(&[('a' , 'z' )]); |
2123 | let cls2 = uclass(&[('a' , 'a' )]); |
2124 | let expected = uclass(&[('b' , 'z' )]); |
2125 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2126 | |
2127 | let cls1 = uclass(&[('a' , 'z' )]); |
2128 | let cls2 = uclass(&[('z' , 'z' )]); |
2129 | let expected = uclass(&[('a' , 'y' )]); |
2130 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2131 | |
2132 | let cls1 = uclass(&[('a' , 'z' )]); |
2133 | let cls2 = uclass(&[('m' , 'm' )]); |
2134 | let expected = uclass(&[('a' , 'l' ), ('n' , 'z' )]); |
2135 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2136 | |
2137 | let cls1 = uclass(&[('a' , 'c' ), ('g' , 'i' ), ('r' , 't' )]); |
2138 | let cls2 = uclass(&[('a' , 'z' )]); |
2139 | let expected = uclass(&[]); |
2140 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2141 | |
2142 | let cls1 = uclass(&[('a' , 'c' ), ('g' , 'i' ), ('r' , 't' )]); |
2143 | let cls2 = uclass(&[('d' , 'v' )]); |
2144 | let expected = uclass(&[('a' , 'c' )]); |
2145 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2146 | |
2147 | let cls1 = uclass(&[('a' , 'c' ), ('g' , 'i' ), ('r' , 't' )]); |
2148 | let cls2 = uclass(&[('b' , 'g' ), ('s' , 'u' )]); |
2149 | let expected = uclass(&[('a' , 'a' ), ('h' , 'i' ), ('r' , 'r' )]); |
2150 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2151 | |
2152 | let cls1 = uclass(&[('a' , 'c' ), ('g' , 'i' ), ('r' , 't' )]); |
2153 | let cls2 = uclass(&[('b' , 'd' ), ('e' , 'g' ), ('s' , 'u' )]); |
2154 | let expected = uclass(&[('a' , 'a' ), ('h' , 'i' ), ('r' , 'r' )]); |
2155 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2156 | |
2157 | let cls1 = uclass(&[('x' , 'z' )]); |
2158 | let cls2 = uclass(&[('a' , 'c' ), ('e' , 'g' ), ('s' , 'u' )]); |
2159 | let expected = uclass(&[('x' , 'z' )]); |
2160 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2161 | |
2162 | let cls1 = uclass(&[('a' , 'z' )]); |
2163 | let cls2 = uclass(&[('a' , 'c' ), ('e' , 'g' ), ('s' , 'u' )]); |
2164 | let expected = uclass(&[('d' , 'd' ), ('h' , 'r' ), ('v' , 'z' )]); |
2165 | assert_eq!(expected, udifference(&cls1, &cls2)); |
2166 | } |
2167 | |
2168 | #[test] |
2169 | fn class_difference_bytes() { |
2170 | let cls1 = bclass(&[(b'a' , b'a' )]); |
2171 | let cls2 = bclass(&[(b'a' , b'a' )]); |
2172 | let expected = bclass(&[]); |
2173 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2174 | |
2175 | let cls1 = bclass(&[(b'a' , b'a' )]); |
2176 | let cls2 = bclass(&[]); |
2177 | let expected = bclass(&[(b'a' , b'a' )]); |
2178 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2179 | |
2180 | let cls1 = bclass(&[]); |
2181 | let cls2 = bclass(&[(b'a' , b'a' )]); |
2182 | let expected = bclass(&[]); |
2183 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2184 | |
2185 | let cls1 = bclass(&[(b'a' , b'z' )]); |
2186 | let cls2 = bclass(&[(b'a' , b'a' )]); |
2187 | let expected = bclass(&[(b'b' , b'z' )]); |
2188 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2189 | |
2190 | let cls1 = bclass(&[(b'a' , b'z' )]); |
2191 | let cls2 = bclass(&[(b'z' , b'z' )]); |
2192 | let expected = bclass(&[(b'a' , b'y' )]); |
2193 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2194 | |
2195 | let cls1 = bclass(&[(b'a' , b'z' )]); |
2196 | let cls2 = bclass(&[(b'm' , b'm' )]); |
2197 | let expected = bclass(&[(b'a' , b'l' ), (b'n' , b'z' )]); |
2198 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2199 | |
2200 | let cls1 = bclass(&[(b'a' , b'c' ), (b'g' , b'i' ), (b'r' , b't' )]); |
2201 | let cls2 = bclass(&[(b'a' , b'z' )]); |
2202 | let expected = bclass(&[]); |
2203 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2204 | |
2205 | let cls1 = bclass(&[(b'a' , b'c' ), (b'g' , b'i' ), (b'r' , b't' )]); |
2206 | let cls2 = bclass(&[(b'd' , b'v' )]); |
2207 | let expected = bclass(&[(b'a' , b'c' )]); |
2208 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2209 | |
2210 | let cls1 = bclass(&[(b'a' , b'c' ), (b'g' , b'i' ), (b'r' , b't' )]); |
2211 | let cls2 = bclass(&[(b'b' , b'g' ), (b's' , b'u' )]); |
2212 | let expected = bclass(&[(b'a' , b'a' ), (b'h' , b'i' ), (b'r' , b'r' )]); |
2213 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2214 | |
2215 | let cls1 = bclass(&[(b'a' , b'c' ), (b'g' , b'i' ), (b'r' , b't' )]); |
2216 | let cls2 = bclass(&[(b'b' , b'd' ), (b'e' , b'g' ), (b's' , b'u' )]); |
2217 | let expected = bclass(&[(b'a' , b'a' ), (b'h' , b'i' ), (b'r' , b'r' )]); |
2218 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2219 | |
2220 | let cls1 = bclass(&[(b'x' , b'z' )]); |
2221 | let cls2 = bclass(&[(b'a' , b'c' ), (b'e' , b'g' ), (b's' , b'u' )]); |
2222 | let expected = bclass(&[(b'x' , b'z' )]); |
2223 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2224 | |
2225 | let cls1 = bclass(&[(b'a' , b'z' )]); |
2226 | let cls2 = bclass(&[(b'a' , b'c' ), (b'e' , b'g' ), (b's' , b'u' )]); |
2227 | let expected = bclass(&[(b'd' , b'd' ), (b'h' , b'r' ), (b'v' , b'z' )]); |
2228 | assert_eq!(expected, bdifference(&cls1, &cls2)); |
2229 | } |
2230 | |
2231 | #[test] |
2232 | fn class_symmetric_difference_unicode() { |
2233 | let cls1 = uclass(&[('a' , 'm' )]); |
2234 | let cls2 = uclass(&[('g' , 't' )]); |
2235 | let expected = uclass(&[('a' , 'f' ), ('n' , 't' )]); |
2236 | assert_eq!(expected, usymdifference(&cls1, &cls2)); |
2237 | } |
2238 | |
2239 | #[test] |
2240 | fn class_symmetric_difference_bytes() { |
2241 | let cls1 = bclass(&[(b'a' , b'm' )]); |
2242 | let cls2 = bclass(&[(b'g' , b't' )]); |
2243 | let expected = bclass(&[(b'a' , b'f' ), (b'n' , b't' )]); |
2244 | assert_eq!(expected, bsymdifference(&cls1, &cls2)); |
2245 | } |
2246 | |
2247 | #[test] |
2248 | #[should_panic ] |
2249 | fn hir_byte_literal_non_ascii() { |
2250 | Hir::literal(Literal::Byte(b'a' )); |
2251 | } |
2252 | |
2253 | // We use a thread with an explicit stack size to test that our destructor |
2254 | // for Hir can handle arbitrarily sized expressions in constant stack |
2255 | // space. In case we run on a platform without threads (WASM?), we limit |
2256 | // this test to Windows/Unix. |
2257 | #[test] |
2258 | #[cfg (any(unix, windows))] |
2259 | fn no_stack_overflow_on_drop() { |
2260 | use std::thread; |
2261 | |
2262 | let run = || { |
2263 | let mut expr = Hir::empty(); |
2264 | for _ in 0..100 { |
2265 | expr = Hir::group(Group { |
2266 | kind: GroupKind::NonCapturing, |
2267 | hir: Box::new(expr), |
2268 | }); |
2269 | expr = Hir::repetition(Repetition { |
2270 | kind: RepetitionKind::ZeroOrOne, |
2271 | greedy: true, |
2272 | hir: Box::new(expr), |
2273 | }); |
2274 | |
2275 | expr = Hir { |
2276 | kind: HirKind::Concat(vec![expr]), |
2277 | info: HirInfo::new(), |
2278 | }; |
2279 | expr = Hir { |
2280 | kind: HirKind::Alternation(vec![expr]), |
2281 | info: HirInfo::new(), |
2282 | }; |
2283 | } |
2284 | assert!(!expr.kind.is_empty()); |
2285 | }; |
2286 | |
2287 | // We run our test on a thread with a small stack size so we can |
2288 | // force the issue more easily. |
2289 | // |
2290 | // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' |
2291 | // for context on the specific stack size chosen here. |
2292 | thread::Builder::new() |
2293 | .stack_size(16 << 10) |
2294 | .spawn(run) |
2295 | .unwrap() |
2296 | .join() |
2297 | .unwrap(); |
2298 | } |
2299 | } |
2300 | |